diff --git a/packages/graphics/bcm2835-driver/package.mk b/packages/graphics/bcm2835-driver/package.mk
index b6a1fb8715..4690e9b506 100644
--- a/packages/graphics/bcm2835-driver/package.mk
+++ b/packages/graphics/bcm2835-driver/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="bcm2835-driver"
-PKG_VERSION="475a89a"
+PKG_VERSION="a5d4376"
 PKG_ARCH="any"
 PKG_LICENSE="nonfree"
 PKG_SITE="http://www.broadcom.com"
diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
index e585077e6f..de62d196d6 100644
--- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="peripheral.joystick"
-PKG_VERSION="eeb6fec"
+PKG_VERSION="110ddb7"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
index aa7dfb9538..9354f10dcd 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.hts"
-PKG_VERSION="3009090"
+PKG_VERSION="5bf84c3"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
index 54fe74e1a6..1c8782f6a0 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="pvr.vdr.vnsi"
-PKG_VERSION="dcdc216"
+PKG_VERSION="9659c8c"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index 5090d0c95a..cd4adb329c 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="kodi"
-PKG_VERSION="a10c504"
+PKG_VERSION="da07d20"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
diff --git a/packages/mediacenter/kodi/patches/kodi-999.01-create-archive-cache.patch b/packages/mediacenter/kodi/patches/kodi-999.01-create-archive-cache.patch
deleted file mode 100644
index a122c67c9b..0000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.01-create-archive-cache.patch
+++ /dev/null
@@ -1,28 +0,0 @@
-From bf68ee5438cb0e1343e7a6c35df35e7a4fb36223 Mon Sep 17 00:00:00 2001
-From: Shani-08 <Shani-08@users.noreply.github.com>
-Date: Sat, 28 Jan 2017 23:21:30 +0000
-Subject: [PATCH] create archive_cache if not exists
-
----
- xbmc/Application.cpp | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index c6ef0c0..13d2cfa 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -1076,10 +1076,10 @@ void CApplication::CreateUserDirs() const
- 
-   //Let's clear our archive cache before starting up anything more
-   auto archiveCachePath = CSpecialProtocol::TranslatePath("special://temp/archive_cache/");
--  if (CDirectory::RemoveRecursive(archiveCachePath))
--    CDirectory::Create(archiveCachePath);
--  else
--    CLog::Log(LOGWARNING, "Failed to remove the archive cache at %s", archiveCachePath.c_str());
-+  if (CDirectory::Exists(archiveCachePath))
-+    if (!CDirectory::RemoveRecursive(archiveCachePath))
-+      CLog::Log(LOGWARNING, "Failed to remove the archive cache at %s", archiveCachePath.c_str());
-+  CDirectory::Create(archiveCachePath);
- 
- }
- 
diff --git a/packages/mediacenter/kodi/patches/kodi-999.17-PR11222-cec-button-repeat-settings.patch b/packages/mediacenter/kodi/patches/kodi-999.17-PR11222-cec-button-repeat-settings.patch
new file mode 100644
index 0000000000..488f1100c5
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-999.17-PR11222-cec-button-repeat-settings.patch
@@ -0,0 +1,134 @@
+From 8a39c3a53fc6fe49a42f91f27d83eb44e6fbcc32 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 19 Dec 2016 14:21:40 +0000
+Subject: [PATCH 1/3] [cec] Drop CEC_DOUBLE_TAP_TIMEOUT_MS_OLD code
+
+Kodi won't even build with libcec 3, so supporting a libcec 2.2 setting is of no value
+---
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 7 +------
+ 1 file changed, 1 insertion(+), 6 deletions(-)
+
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index 5b5b38d..2d4b06c 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -1392,13 +1392,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
+   m_configuration.bPowerOffOnStandby = iStandbyAction == LOCALISED_ID_SUSPEND ? 1 : 0;
+   m_bShutdownOnStandby = iStandbyAction == LOCALISED_ID_POWEROFF;
+ 
+-#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
+-  // double tap prevention timeout in ms. libCEC uses 50ms units for this in 2.2.0, so divide by 50
+-  m_configuration.iDoubleTapTimeout50Ms = GetSettingInt("double_tap_timeout_ms") / 50;
+-#else
+-  // backwards compatibility. will be removed once the next major release of libCEC is out
++  // double tap prevention timeout in ms
+   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
+-#endif
+ 
+   if (GetSettingBool("pause_playback_on_deactivate"))
+   {
+
+From b79f99511bcf81fb07d90ba9c3aa378c88d7cc03 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 19 Dec 2016 14:21:40 +0000
+Subject: [PATCH 2/3] [cec] Add settings for configuring button repeats
+
+---
+ addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
+ system/peripherals.xml                              |  4 +++-
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 11 +++++++++++
+ 3 files changed, 29 insertions(+), 1 deletion(-)
+
+diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
+index 699eae7..34bc0d8 100644
+--- a/addons/resource.language.en_gb/resources/strings.po
++++ b/addons/resource.language.en_gb/resources/strings.po
+@@ -20426,3 +20426,18 @@ msgstr ""
+ msgctxt "#39016"
+ msgid "Resume audiobook"
+ msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38050"
++msgid "Remote button press delay before repeating (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38051"
++msgid "Remote button press repeat rate (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38052"
++msgid "Remote button press release time (ms)"
++msgstr ""
+diff --git a/system/peripherals.xml b/system/peripherals.xml
+index d5704b2..02b1a9e 100644
+--- a/system/peripherals.xml
++++ b/system/peripherals.xml
+@@ -31,7 +31,9 @@
+     <setting key="device_type" type="int" value="1" configurable="0" />
+     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
+     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
+-    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
++    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
++    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
++    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
+   </peripheral>
+ 
+   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index 2d4b06c..cb175d3 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -1297,6 +1297,15 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
+   m_configuration.bActivateSource = config.bActivateSource;
+   bChanged |= SetSetting("activate_source", m_configuration.bActivateSource == 1);
+ 
++  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
++  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs);
++
++  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
++  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
++
++  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
++  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
++
+   m_configuration.bPowerOffOnStandby = config.bPowerOffOnStandby;
+ 
+   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
+@@ -1394,6 +1403,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
+ 
+   // double tap prevention timeout in ms
+   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
++  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
++  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
+ 
+   if (GetSettingBool("pause_playback_on_deactivate"))
+   {
+
+From 5f19f6a2c06075e4d75b66772e304cd5f50523eb Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 19 Dec 2016 14:21:40 +0000
+Subject: [PATCH 3/3] [cec] Don't discard buttons when repeat mode is enabled
+
+---
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index cb175d3..9b2943f 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -804,7 +804,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
+   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
+ 
+   CSingleLock lock(m_critSection);
+-  if (key.iDuration > 0)
++  // avoid the queue getting too long
++  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
++    return;
++  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
+   {
+     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
+     {
diff --git a/packages/mediacenter/kodi/patches/kodi-999.99-PR11663.patch b/packages/mediacenter/kodi/patches/kodi-999.99-PR11663.patch
deleted file mode 100644
index de03bf615c..0000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.99-PR11663.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-From adecb3af2e3eb5a2715e6f1264fe3047dd209318 Mon Sep 17 00:00:00 2001
-From: Jonas Karlman <jonas@kwiboo.se>
-Date: Sun, 12 Feb 2017 16:31:29 +0100
-Subject: [PATCH] [settings] change allowed remotedelay range to inlcude zero
-
----
- xbmc/settings/AdvancedSettings.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index cc37998..1c00eda 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -1010,7 +1010,7 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-     }
-   }
- 
--  XMLUtils::GetInt(pRootElement, "remotedelay", m_remoteDelay, 1, 20);
-+  XMLUtils::GetInt(pRootElement, "remotedelay", m_remoteDelay, 0, 20);
-   XMLUtils::GetFloat(pRootElement, "controllerdeadzone", m_controllerDeadzone, 0.0f, 1.0f);
-   XMLUtils::GetUInt(pRootElement, "fanartres", m_fanartRes, 0, 1080);
-   XMLUtils::GetUInt(pRootElement, "imageres", m_imageRes, 0, 1080);
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1000-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1000-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
deleted file mode 100644
index 0e814fa3c0..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1000-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-From 84e9a1784bbd3182b68cefa5e5feae8da8b9e184 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 5 Jun 2015 22:48:33 +0100
-Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
- point
-
----
- libavcodec/avcodec.h       | 1 +
- libavcodec/mpeg4videodec.c | 4 ++++
- 2 files changed, 5 insertions(+)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index 8c7c420..e63dc2d 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -2527,6 +2527,7 @@ typedef struct AVCodecContext {
- #define FF_BUG_DC_CLIP          4096
- #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
- #define FF_BUG_TRUNCATED       16384
-+#define FF_BUG_GMC_UNSUPPORTED 32768
- 
-     /**
-      * strictly follow the standard (MPEG4, ...).
-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-index 9bf33dd..0b5d3b9 100644
---- a/libavcodec/mpeg4videodec.c
-+++ b/libavcodec/mpeg4videodec.c
-@@ -2179,6 +2179,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
- 
-         if (ctx->divx_version >= 0)
-             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
-+
-+        if (ctx->num_sprite_warping_points > 1)
-+            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
-     }
- 
-     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-@@ -2203,6 +2206,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
- 
-+    avctx->workaround_bugs = s->workaround_bugs;
-     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
-         s->codec_id == AV_CODEC_ID_MPEG4 &&
-         avctx->idct_algo == FF_IDCT_AUTO) {
--- 
-1.9.1
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-hevcdsp_ARM_NEON_optimized_epel_functions.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-hevcdsp_ARM_NEON_optimized_epel_functions.patch
deleted file mode 100644
index d19984b4de..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-hevcdsp_ARM_NEON_optimized_epel_functions.patch
+++ /dev/null
@@ -1,410 +0,0 @@
-From 29c3327a0d72a7e872ff170363cfe5ed13bca5d0 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Tue, 22 Dec 2015 18:10:24 +0000
-Subject: [PATCH] hevcdsp: ARM NEON optimized epel functions
-
----
- libavcodec/arm/Makefile            |   1 +
- libavcodec/arm/hevcdsp_epel_neon.S | 334 +++++++++++++++++++++++++++++++++++++
- libavcodec/arm/hevcdsp_init_neon.c |  23 +++
- 3 files changed, 358 insertions(+)
- create mode 100644 libavcodec/arm/hevcdsp_epel_neon.S
-
-diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index cdd35b0..6051ec8 100644
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -131,6 +131,7 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
-                                           arm/synth_filter_neon.o
- NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-                                           arm/hevcdsp_deblock_neon.o    \
-+                                          arm/hevcdsp_epel_neon.o       \
-                                           arm/hevcdsp_idct_neon.o       \
-                                           arm/hevcdsp_qpel_neon.o
- NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
-diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-new file mode 100644
-index 0000000..516ae5b
---- /dev/null
-+++ b/libavcodec/arm/hevcdsp_epel_neon.S
-@@ -0,0 +1,334 @@
-+/*
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+#define MAX_PB_SIZE #64
-+
-+.macro vextin_d4
-+    vld1.8    {q10}, [r1], r2
-+    vmov      d16, d20
-+    vext.8    d17, d20, d21, #1
-+    vext.8    d18, d20, d21, #2
-+    vext.8    d19, d20, d21, #3
-+.endm
-+
-+.macro vextin_d4_8
-+    vld1.8    d16, [r1], r2
-+    vext.8    d17, d16, d16, #1
-+    vext.8    d18, d16, d16, #2
-+    vext.8    d19, d16, d16, #3
-+.endm
-+
-+.macro load_coeffs_16b coeffs
-+    ldr      \coeffs, [\coeffs]
-+    vdup.i8  d0, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d1, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d2, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d3, \coeffs
-+.endm
-+
-+.macro epel_filter_16b out=q12
-+    vmull.u8 q3, d16, d0
-+    vmull.u8 q11, d19, d3
-+    vmull.u8 \out, d17, d1
-+    vmull.u8 q10, d18, d2
-+    vadd.s16 q3, q11
-+    vadd.s16 \out, q10
-+    vsub.s16 \out, q3
-+.endm
-+
-+.macro load_coeffs_32b coeffs
-+    ldr      \coeffs, [\coeffs]
-+    vmov.i64 d4, #0
-+    vmov.8   d4[0], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[2], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[4], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[6], \coeffs
-+.endm
-+
-+.macro epel_filter_32b
-+    vmull.s16 q3, d24, d4[0] //q12
-+    vmull.s16 q4, d25, d4[0]
-+    vmull.s16 q5, d30, d4[3] //q15
-+    vmull.s16 q6, d31, d4[3]
-+
-+    vmull.s16 q7, d26, d4[1] // q13
-+    vmull.s16 q8, d27, d4[1]
-+    vmull.s16 q9, d28, d4[2] // q14
-+    vmull.s16 q10, d29, d4[2]
-+    vadd.s32 q3, q5
-+    vadd.s32 q4, q6
-+    vadd.s32 q7, q9
-+    vadd.s32 q8, q10
-+    vsub.s32 q7, q3
-+    vsub.s32 q8, q4
-+    vqshrn.s32  d6, q7, #6
-+    vqshrn.s32  d7, q8, #6
-+.endm
-+
-+.macro epel_filter_32b_4
-+    vmull.s16 q3, d24, d4[0] //q12
-+    vmull.s16 q5, d30, d4[3] //q15
-+    vmull.s16 q7, d26, d4[1] // q13
-+    vmull.s16 q9, d28, d4[2] // q14
-+    vadd.s32 q3, q5
-+    vadd.s32 q7, q9
-+    vsub.s32 q7, q3
-+    vqshrn.s32  d6, q7, #6
-+.endm
-+
-+function ff_hevc_put_epel_h_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r7, [sp, #16] // mx
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+        adrl   r12, epel_coeffs
-+        add    r7, r12
-+        sub       r1, #1
-+        lsl       r4, #1
-+        load_coeffs_16b r7
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      subs r3, #1
-+        pld [r1]
-+        vextin_d4
-+        epel_filter_16b
-+        vst1.16    {q12}, [r0], r4
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        cmp       r5, #4
-+        bgt       8b
-+4:      subs r3, #1
-+        pld [r1]
-+        vextin_d4_8
-+        epel_filter_16b
-+        vst1.16    d24, [r0], r4
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+2:      subs r3, #1
-+        pld [r1]
-+        vextin_d4_8
-+        epel_filter_16b
-+        vst1.32    d24[0], [r0], r4
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_put_epel_v_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r7, [sp, #20] // my
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+        adrl   r12, epel_coeffs
-+        add    r7, r12
-+        load_coeffs_16b r7
-+        sub       r1, r2
-+        lsl       r4, #1
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+0:      pld [r1]
-+        vld1.8    {d16}, [r1], r2
-+        pld [r1]
-+        vld1.8    {d17}, [r1], r2
-+        pld [r1]
-+        vld1.8    {d18}, [r1], r2
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      pld [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.16    {q12}, [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        b         0b
-+4:      pld       [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.16    d24, [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+        b         0b
-+2:      pld [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.32    d24[0], [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_put_epel_hv_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r6, [sp, #16] // mx
-+        ldr    r7, [sp, #20] // my
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+        adrl   r12, epel_coeffs
-+        sub    r6, #1
-+        lsl    r6, #2
-+        add    r6, r12 // mx epel coeff offset
-+        add    r7, r12
-+        sub       r1, #1
-+        sub       r1, r2
-+        lsl       r4, #1
-+        load_coeffs_16b r6
-+        load_coeffs_32b r7
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+0:      pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q12
-+        pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q13
-+        pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q14
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      pld     [r1]
-+        vextin_d4
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b
-+        vst1.16    {q3}, [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        b         0b
-+4:      pld      [r1]
-+        vextin_d4_8
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b_4
-+        vst1.16    d6, [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+        b         0b
-+2:      pld      [r1]
-+        vextin_d4_8
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b_4
-+        vst1.32    d6[0], [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+epel_coeffs:
-+       .byte 2, 58, 10, 2
-+       .byte 4, 54, 16, 2
-+       .byte 6, 46, 28, 4
-+       .byte 4, 36, 36, 4
-+       .byte 4, 28, 46, 6
-+       .byte 2, 16, 54, 4
-+       .byte 2, 10, 58, 2
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 5591807..733ff08 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -58,6 +58,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
- PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
- PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
- #undef PUT_PIXELS
-+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
-+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
-+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
- 
- static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-                                    int height, int width);
-@@ -201,7 +210,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
-             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-+            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
-+            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
-+            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
-         }
-+        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-+        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-+        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-+        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
-+        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
-+        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
-+        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
-+        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
-+        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
-+        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
-+
-         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
--- 
-2.5.0
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-added_ARM_NEON_optimized_SAO_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-added_ARM_NEON_optimized_SAO_patches.patch
deleted file mode 100644
index f090c38c7e..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-added_ARM_NEON_optimized_SAO_patches.patch
+++ /dev/null
@@ -1,3329 +0,0 @@
-From b0cb307c253d2c9f4b94a54dfc74ddb83af984cc Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Mon, 8 Dec 2014 13:24:40 +0200
-Subject: [PATCH 1/9] added ARM NEON optimized SAO band offset
-
----
- libavcodec/arm/Makefile            |   3 +-
- libavcodec/arm/hevcdsp_init_neon.c |  47 +++++++++
- libavcodec/arm/hevcdsp_sao_neon.S  | 204 +++++++++++++++++++++++++++++++++++++
- 3 files changed, 253 insertions(+), 1 deletion(-)
- create mode 100644 libavcodec/arm/hevcdsp_sao_neon.S
-
-diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index 6051ec8..093a2e8 100644
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -133,7 +133,8 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-                                           arm/hevcdsp_deblock_neon.o    \
-                                           arm/hevcdsp_epel_neon.o       \
-                                           arm/hevcdsp_idct_neon.o       \
--                                          arm/hevcdsp_qpel_neon.o
-+                                          arm/hevcdsp_qpel_neon.o       \
-+                                          arm/hevcdsp_sao_neon.o
- NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
- NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
-                                           arm/rv40dsp_neon.o
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 733ff08..69e2b2c 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -22,6 +22,7 @@
- #include "libavutil/arm/cpu.h"
- #include "libavcodec/hevcdsp.h"
- #include "hevcdsp_arm.h"
-+#include "../bit_depth_template.c"
- 
- void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
- void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-@@ -43,6 +44,11 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- 
-+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+
- #define PUT_PIXELS(name) \
-     void name(int16_t *dst, uint8_t *src, \
-                                 ptrdiff_t srcstride, int height, \
-@@ -151,6 +157,44 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
-     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
- }
- 
-+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
-+{
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int8_t offset_table[32] = { 0 };
-+    int k, y, x;
-+    int shift  = 3; // BIT_DEPTH - 5
-+
-+    stride_src /= sizeof(pixel);
-+    stride_dst /= sizeof(pixel);
-+
-+    for (k = 0; k < 4; k++)
-+        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-+
-+    switch(width){
-+    case 8:
-+        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        break;
-+    case 16:
-+        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        break;
-+    case 32:
-+        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        break;
-+    case 64:
-+        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        break;
-+    default:
-+        for (y = 0; y < height; y++) {
-+            for (x = 0; x < width; x++)
-+                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-+            dst += stride_dst;
-+            src += stride_src;
-+        }
-+    }
-+}
-+
- av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
- {
-     if (bit_depth == 8) {
-@@ -170,6 +214,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-+        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-+          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-+        }
-         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-new file mode 100644
-index 0000000..1f0ad64
---- /dev/null
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -0,0 +1,204 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+function ff_hevc_sao_band_w8_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // offset_table
-+        vpush {d8-d15}
-+        vld1.8  {q0, q1}, [r5] // offset table
-+
-+1:      subs    r4, #1
-+        vld1.8   {d24}, [r1], r3
-+        vshr.u8  d16, d24, #3
-+        vtbl.8   d16, {q0, q1}, d16
-+        vmovl.s8 q2, d16
-+        vmovl.u8 q6, d24
-+        vadd.s16 q2, q6
-+        vqmovun.s16 d4, q2
-+        vst1.8  {d4}, [r0], r2
-+        bne    1b
-+
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_band_w16_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // offset_table
-+        vpush {d8-d15}
-+        vld1.8  {q0, q1}, [r5] // offset table
-+
-+1:      subs    r4, #1
-+        vld1.8  {q12}, [r1], r3
-+
-+        vshr.u8   q8, q12, #3
-+
-+        vtbl.8  d16, {q0, q1}, d16
-+        vtbl.8  d17, {q0, q1}, d17
-+
-+        vmovl.s8 q2, d16
-+        vmovl.s8 q3, d17
-+
-+        vmovl.u8 q6, d24
-+        vmovl.u8 q7, d25
-+
-+        vadd.s16 q2, q6
-+        vadd.s16 q3, q7
-+
-+        vqmovun.s16 d4, q2
-+        vqmovun.s16 d5, q3
-+
-+        vstm.8   r0, {q2}
-+        add    r0, r2
-+        bne    1b
-+
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_band_w32_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // offset_table
-+        vpush {d8-d15}
-+        vld1.8  {q0, q1}, [r5] // offset table
-+
-+1:      subs    r4, #1
-+        vld1.8  {q12-q13}, [r1], r3
-+
-+        vshr.u8   q8, q12, #3
-+        vshr.u8   q9, q13, #3
-+
-+        vtbl.8  d16, {q0, q1}, d16
-+        vtbl.8  d17, {q0, q1}, d17
-+        vtbl.8  d18, {q0, q1}, d18
-+        vtbl.8  d19, {q0, q1}, d19
-+
-+        vmovl.s8 q2, d16
-+        vmovl.s8 q3, d17 // q8 free
-+        vmovl.s8 q4, d18
-+        vmovl.s8 q5, d19 // q9 free
-+
-+        vmovl.u8 q6, d24
-+        vmovl.u8 q7, d25 // q12 free
-+        vmovl.u8 q8, d26
-+        vmovl.u8 q9, d27 // q13 free
-+
-+        vadd.s16 q2, q6
-+        vadd.s16 q3, q7
-+        vadd.s16 q4, q8
-+        vadd.s16 q5, q9
-+
-+        vqmovun.s16 d4, q2
-+        vqmovun.s16 d5, q3
-+        vqmovun.s16 d6, q4 // q4 free
-+        vqmovun.s16 d7, q5 // q5 free
-+
-+        vst1.8 {q2-q3}, [r0], r2
-+        bne    1b
-+
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_band_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // offset_table
-+        vpush {d8-d15}
-+        vld1.8  {q0, q1}, [r5] // offset table
-+
-+1:      subs    r4, #1
-+        vld1.8  {q12-q13}, [r1]!
-+        vld1.8  {q14-q15}, [r1], r3
-+        sub     r1, #32
-+
-+        vshr.u8   q8, q12, #3
-+        vshr.u8   q9, q13, #3
-+        vshr.u8  q10, q14, #3
-+        vshr.u8  q11, q15, #3
-+
-+        vtbl.8  d16, {q0, q1}, d16
-+        vtbl.8  d17, {q0, q1}, d17
-+        vtbl.8  d18, {q0, q1}, d18
-+        vtbl.8  d19, {q0, q1}, d19
-+        vtbl.8  d20, {q0, q1}, d20
-+        vtbl.8  d21, {q0, q1}, d21
-+        vtbl.8  d22, {q0, q1}, d22
-+        vtbl.8  d23, {q0, q1}, d23
-+
-+        vmovl.s8 q2, d16
-+        vmovl.s8 q3, d17 // q8 free
-+        vmovl.s8 q4, d18
-+        vmovl.s8 q5, d19 // q9 free
-+
-+        vmovl.u8 q6, d24
-+        vmovl.u8 q7, d25 // q12 free
-+        vmovl.u8 q8, d26
-+        vmovl.u8 q9, d27 // q13 free
-+
-+        vadd.s16 q2, q6
-+        vadd.s16 q3, q7
-+        vadd.s16 q4, q8
-+        vadd.s16 q5, q9
-+
-+        vqmovun.s16 d4, q2
-+        vqmovun.s16 d5, q3
-+        vqmovun.s16 d6, q4 // q4 free
-+        vqmovun.s16 d7, q5 // q5 free
-+
-+        // free q4 -q9, q12 - q13
-+        vmovl.s8 q4, d20
-+        vmovl.s8 q5, d21 // q10 free
-+        vmovl.s8 q6, d22
-+        vmovl.s8 q7, d23 // q11 free
-+
-+        vmovl.u8  q8, d28
-+        vmovl.u8  q9, d29 // q14 free
-+        vmovl.u8 q10, d30
-+        vmovl.u8 q11, d31 // q15 free
-+
-+        vadd.s16 q4, q8
-+        vadd.s16 q5, q9
-+        vadd.s16 q6, q10
-+        vadd.s16 q7, q11
-+
-+        vqmovun.s16  d8, q4
-+        vqmovun.s16  d9, q5
-+        vqmovun.s16 d10, q6
-+        vqmovun.s16 d11, q7
-+
-+        vstm.8   r0, {q2-q5}
-+        add    r0, r2
-+        bne    1b
-+
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
--- 
-2.5.0
-
-
-From 8429b1de64bb871d57651ecfe3b084e2dfe0af51 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Wed, 27 May 2015 18:10:20 +0100
-Subject: [PATCH 2/9] added NEON optimized sao edge for eo1 width 64
-
----
- libavcodec/arm/hevcdsp_init_neon.c |  47 ++++++++++++
- libavcodec/arm/hevcdsp_sao_neon.S  | 147 +++++++++++++++++++++++++++++++++++++
- 2 files changed, 194 insertions(+)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 69e2b2c..c7b5404 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -22,6 +22,7 @@
- #include "libavutil/arm/cpu.h"
- #include "libavcodec/hevcdsp.h"
- #include "hevcdsp_arm.h"
-+#include "libavcodec/avcodec.h"
- #include "../bit_depth_template.c"
- 
- void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-@@ -48,6 +49,7 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
- void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
- void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
- void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
- 
- #define PUT_PIXELS(name) \
-     void name(int16_t *dst, uint8_t *src, \
-@@ -195,6 +197,50 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-     }
- }
- 
-+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-+                                          int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-+    static const int8_t pos[4][2][2] = {
-+        { { -1,  0 }, {  1, 0 } }, // horizontal
-+        { {  0, -1 }, {  0, 1 } }, // vertical
-+        { { -1, -1 }, {  1, 1 } }, // 45 degree
-+        { {  1, -1 }, { -1, 1 } }, // 135 degree
-+    };
-+    int8_t sao_offset_val[8];  // padding of 3 for vld
-+    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
-+    pixel *dst = (pixel *)_dst;
-+    pixel *src = (pixel *)_src;
-+    int a_stride, b_stride;
-+    int x, y;
-+
-+    for (x = 0; x < 5; x++) {
-+        sao_offset_val[x] = _sao_offset_val[x];
-+    }
-+
-+    stride_src /= sizeof(pixel);
-+    stride_dst /= sizeof(pixel);
-+
-+    if (eo == 1 && width == 64) {
-+        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+    } else {
-+        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-+        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-+        for (y = 0; y < height; y++) {
-+            for (x = 0; x < width; x++) {
-+                int diff0         = CMP(src[x], src[x + a_stride]);
-+                int diff1         = CMP(src[x], src[x + b_stride]);
-+                int offset_val    = edge_idx[2 + diff0 + diff1];
-+                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-+            }
-+            src += stride_src;
-+            dst += stride_dst;
-+        }
-+    }
-+}
-+#undef CMP
-+
- av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
- {
-     if (bit_depth == 8) {
-@@ -216,6 +262,7 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-         for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-           c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-+          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
-         }
-         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 1f0ad64..5ec2de9 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -202,3 +202,150 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-         bx lr
- endfunc
- 
-+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x02
-+        vpush {d8-d15}
-+1:      subs    r4, #1
-+        // load a
-+        sub     r1, r3
-+        vld1.8  {q0-q1}, [r1]!
-+        vld1.8  {q2-q3}, [r1], r3
-+        sub     r1, #32
-+        // load c
-+        vld1.8  {q4-q5}, [r1]!
-+        vld1.8  {q6-q7}, [r1], r3
-+        sub     r1, #32
-+        // load b
-+        vld1.8  {q8-q9}, [r1]!
-+        vld1.8  {q10-q11}, [r1], r3
-+        sub     r1, #32
-+
-+        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+        vcgt.u8 q13, q5, q1
-+        vcgt.u8 q1,  q1, q5
-+        vcgt.u8 q14, q6, q2
-+        vcgt.u8 q2,  q2, q6
-+        vcgt.u8 q15, q7, q3
-+        vcgt.u8 q3,  q3, q7
-+
-+        vsub.s8 q12, q0, q12 // diff0
-+        vsub.s8 q13, q1, q13
-+        vsub.s8 q14, q2, q14
-+        vsub.s8 q15, q3, q15
-+
-+        vcgt.u8  q0,  q4, q8 // c > b
-+        vcgt.u8  q8,  q8, q4 // b > c
-+        vcgt.u8  q1,  q5, q9
-+        vcgt.u8  q9,  q9, q5
-+        vcgt.u8  q2,  q6, q10
-+        vcgt.u8 q10, q10, q6
-+        vcgt.u8  q3,  q7, q11
-+        vcgt.u8 q11, q11, q7
-+
-+        vsub.s8 q0, q8, q0 // diff1
-+        vsub.s8 q1, q9, q1
-+        vsub.s8 q2, q10, q2
-+        vsub.s8 q3, q11, q3
-+
-+        veor.u8 q8, q8  // zero register
-+        vdup.s8 q9, r6  // 2 to all elements
-+        add     r6, #1
-+        vdup.s8 q10, r6 // 3 to all elements
-+        sub     r6, #1
-+
-+        vadd.s8 q0, q12 //diff0 + diff1
-+        vadd.s8 q1, q13
-+        vadd.s8 q2, q14
-+        vadd.s8 q3, q15
-+
-+        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
-+        vcgt.s8 q5, q1, q8
-+        vcgt.s8 q6, q2, q8
-+        vcgt.s8 q7, q3, q8
-+
-+        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
-+        vclt.s8 q12, q1, q8
-+        vclt.s8 q13, q2, q8
-+        vclt.s8 q14, q3, q8
-+
-+        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
-+        vand.8  q15,  q8, q4
-+        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
-+        vand.8   q8,  q8, q11
-+        vadd.s8  q0, q15, q8  // offset_idx
-+
-+        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
-+        vand.8  q15,  q8, q5
-+        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
-+        vand.8   q8,  q8, q12
-+        vadd.s8  q1, q15, q8  // offset_idx
-+
-+        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
-+        vand.8  q15,  q8, q6
-+        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
-+        vand.8   q8,  q8, q13
-+        vadd.s8  q2, q15, q8  // offset_idx
-+
-+        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
-+        vand.8  q15,  q8, q7
-+        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
-+        vand.8   q8,  q8, q14
-+        vadd.s8  q3, q15, q8  // offset_idx
-+        // TODO: load only once
-+        vld1.8   d16, [r5]
-+
-+        vtbl.8   d0, {d16}, d0
-+        vtbl.8   d1, {d16}, d1
-+        vtbl.8   d2, {d16}, d2
-+        vtbl.8   d3, {d16}, d3
-+        vtbl.8   d4, {d16}, d4
-+        vtbl.8   d5, {d16}, d5
-+        vtbl.8   d6, {d16}, d6
-+        vtbl.8   d7, {d16}, d7
-+
-+        // TODO: load only once
-+        // load c again
-+        sub     r1, r3
-+        sub     r1, r3
-+        vld1.8  {q4-q5}, [r1]!
-+        vld1.8  {q6-q7}, [r1], r3
-+        sub     r1, #32
-+
-+        vmovl.u8   q8, d8
-+        vmovl.u8   q9, d9
-+        vmovl.u8  q10, d10
-+        vmovl.u8  q11, d11
-+        vmovl.u8  q12, d12
-+        vmovl.u8  q13, d13
-+        vmovl.u8  q14, d14
-+        vmovl.u8  q15, d15
-+
-+        vaddw.s8  q8, d0
-+        vaddw.s8  q9, d1
-+        vaddw.s8 q10, d2
-+        vaddw.s8 q11, d3
-+        vaddw.s8 q12, d4
-+        vaddw.s8 q13, d5
-+        vaddw.s8 q14, d6
-+        vaddw.s8 q15, d7
-+
-+        vqmovun.s16  d0, q8
-+        vqmovun.s16  d1, q9
-+        vqmovun.s16  d2, q10
-+        vqmovun.s16  d3, q11
-+        vqmovun.s16  d4, q12
-+        vqmovun.s16  d5, q13
-+        vqmovun.s16  d6, q14
-+        vqmovun.s16  d7, q15
-+
-+        vstm r0, {q0-q3}
-+        add  r0, r2
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
--- 
-2.5.0
-
-
-From 402e2bd1c5ad659c757bf9734abe6331904fb9e2 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Tue, 16 Dec 2014 16:28:25 +0200
-Subject: [PATCH 3/9] Added SAO edge offset for ARM NEON w32 and w64
-
----
- libavcodec/arm/hevcdsp_init_neon.c |  46 +++-
- libavcodec/arm/hevcdsp_sao_neon.S  | 510 +++++++++++++++++++++++++++++++------
- 2 files changed, 474 insertions(+), 82 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index c7b5404..c32940e 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -49,7 +49,16 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
- void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
- void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
- void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+
-+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+
-+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
- void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
- 
- #define PUT_PIXELS(name) \
-     void name(int16_t *dst, uint8_t *src, \
-@@ -222,9 +231,40 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-     stride_src /= sizeof(pixel);
-     stride_dst /= sizeof(pixel);
- 
--    if (eo == 1 && width == 64) {
--        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
--    } else {
-+    switch (width) {
-+    case 32:
-+        switch(eo) {
-+        case 0:
-+            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 1:
-+            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 2:
-+            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 3:
-+            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        }
-+        break;
-+    case 64:
-+        switch(eo) {
-+        case 0:
-+            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 1:
-+            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 2:
-+            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        case 3:
-+            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+            break;
-+        }
-+        break;
-+    default:
-         a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-         b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-         for (y = 0; y < height; y++) {
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 5ec2de9..4687012 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -202,27 +202,7 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-         bx lr
- endfunc
- 
--function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x02
--        vpush {d8-d15}
--1:      subs    r4, #1
--        // load a
--        sub     r1, r3
--        vld1.8  {q0-q1}, [r1]!
--        vld1.8  {q2-q3}, [r1], r3
--        sub     r1, #32
--        // load c
--        vld1.8  {q4-q5}, [r1]!
--        vld1.8  {q6-q7}, [r1], r3
--        sub     r1, #32
--        // load b
--        vld1.8  {q8-q9}, [r1]!
--        vld1.8  {q10-q11}, [r1], r3
--        sub     r1, #32
--
-+.macro edge_w64_body
-         vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-         vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-         vcgt.u8 q13, q5, q1
-@@ -251,69 +231,61 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-         vsub.s8 q2, q10, q2
-         vsub.s8 q3, q11, q3
- 
--        veor.u8 q8, q8  // zero register
--        vdup.s8 q9, r6  // 2 to all elements
--        add     r6, #1
--        vdup.s8 q10, r6 // 3 to all elements
--        sub     r6, #1
--
-         vadd.s8 q0, q12 //diff0 + diff1
-         vadd.s8 q1, q13
-         vadd.s8 q2, q14
-         vadd.s8 q3, q15
- 
--        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
--        vcgt.s8 q5, q1, q8
--        vcgt.s8 q6, q2, q8
--        vcgt.s8 q7, q3, q8
--
--        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
--        vclt.s8 q12, q1, q8
--        vclt.s8 q13, q2, q8
--        vclt.s8 q14, q3, q8
--
--        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
--        vand.8  q15,  q8, q4
--        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
--        vand.8   q8,  q8, q11
--        vadd.s8  q0, q15, q8  // offset_idx
--
--        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
--        vand.8  q15,  q8, q5
--        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
--        vand.8   q8,  q8, q12
--        vadd.s8  q1, q15, q8  // offset_idx
--
--        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
--        vand.8  q15,  q8, q6
--        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
--        vand.8   q8,  q8, q13
--        vadd.s8  q2, q15, q8  // offset_idx
--
--        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
--        vand.8  q15,  q8, q7
--        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
--        vand.8   q8,  q8, q14
--        vadd.s8  q3, q15, q8  // offset_idx
--        // TODO: load only once
--        vld1.8   d16, [r5]
--
--        vtbl.8   d0, {d16}, d0
--        vtbl.8   d1, {d16}, d1
--        vtbl.8   d2, {d16}, d2
--        vtbl.8   d3, {d16}, d3
--        vtbl.8   d4, {d16}, d4
--        vtbl.8   d5, {d16}, d5
--        vtbl.8   d6, {d16}, d6
--        vtbl.8   d7, {d16}, d7
--
--        // TODO: load only once
--        // load c again
--        sub     r1, r3
--        sub     r1, r3
--        vld1.8  {q4-q5}, [r1]!
--        vld1.8  {q6-q7}, [r1], r3
--        sub     r1, #32
-+        vdup.s8 q9, r6 // 3 to all elements
-+        sub     r6, #1
-+
-+        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+        vclt.s8 q13, q1, #0
-+        vclt.s8 q14, q2, #0
-+        vclt.s8 q15, q3, #0
-+
-+        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
-+        vadd.s8  q10,  q1, q9
-+        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+        vand.8   q13, q10, q13
-+        vadd.s8  q8,  q2, q9
-+        vadd.s8  q10,  q3, q9
-+        vand.8   q14, q8, q14
-+        vand.8   q15, q10, q15
-+
-+        vdup.s8 q9, r6  // 2 to all elements
-+        add     r6, #1
-+
-+        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
-+        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vcgt.s8  q10, q1, #0
-+        vadd.s8   q0, q11, q12  // offset_idx
-+
-+        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
-+        vcgt.s8  q12, q2, #0
-+        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
-+        vadd.s8   q1, q11, q13
-+
-+        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vcgt.s8  q10, q3, #0
-+        vadd.s8   q2, q11, q14
-+
-+        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
-+        vmov.32  d18[0], r7  // load offset table from general registers
-+        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vmov.32  d18[1], r5  // load rest of offset table
-+        vadd.s8   q3, q11, q15
-+
-+        vtbl.8   d0, {d18}, d0
-+        vtbl.8   d1, {d18}, d1
-+        vtbl.8   d2, {d18}, d2
-+        vtbl.8   d3, {d18}, d3
-+        vtbl.8   d4, {d18}, d4
-+        vtbl.8   d5, {d18}, d5
-+        vtbl.8   d6, {d18}, d6
-+        vtbl.8   d7, {d18}, d7
- 
-         vmovl.u8   q8, d8
-         vmovl.u8   q9, d9
-@@ -344,8 +316,388 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
- 
-         vstm r0, {q0-q3}
-         add  r0, r2
-+.endm
-+
-+.macro edge_w32_body
-+        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+        vcgt.u8 q13, q5, q1
-+        vcgt.u8 q1,  q1, q5
-+
-+        vsub.s8 q12, q0, q12 // diff0
-+        vcgt.u8  q0,  q4, q8 // c > b
-+        vsub.s8 q13, q1, q13 // diff0 part 2
-+
-+        vcgt.u8  q6,  q8, q4 // b > c
-+        vcgt.u8  q1,  q5, q9
-+        vcgt.u8  q7,  q9, q5
-+
-+        vsub.s8 q0, q6, q0 // diff1
-+        vsub.s8 q1, q7, q1 // diff1 part 2
-+        vadd.s8 q0, q12 //diff0 + diff1
-+
-+        vdup.s8 q7, r6 // 3 to all elements
-+        sub     r6, #1
-+        vadd.s8 q1, q13
-+
-+        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+        vclt.s8 q13, q1, #0
-+
-+        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
-+        vadd.s8  q10,  q1, q7
-+        vdup.s8 q7, r6  // 2 to all elements
-+        add     r6, #1
-+        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+        vand.8   q13, q10, q13
-+
-+
-+        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
-+        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vcgt.s8  q10, q1, #0
-+        vadd.s8   q0, q11, q12  // offset_idx
-+
-+        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
-+        vmov.32  d14[0], r7  // load offset table from general registers
-+        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+        vmov.32  d14[1], r5  // load rest of offset table
-+        vadd.s8   q1, q11, q13
-+
-+        vtbl.8   d0, {d14}, d0
-+        vtbl.8   d1, {d14}, d1
-+        vtbl.8   d2, {d14}, d2
-+        vtbl.8   d3, {d14}, d3
-+
-+        vmovl.u8   q6, d8
-+        vmovl.u8   q7, d9
-+        vmovl.u8  q10, d10
-+        vmovl.u8  q11, d11
-+
-+        vaddw.s8  q6, d0
-+        vaddw.s8  q7, d1
-+        vaddw.s8 q10, d2
-+        vaddw.s8 q11, d3
-+
-+        vqmovun.s16  d0, q6
-+        vqmovun.s16  d1, q7
-+        vqmovun.s16  d2, q10
-+        vqmovun.s16  d3, q11
-+
-+        vstm r0, {q0-q1}
-+        add  r0, r2
-+.endm
-+
-+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+        sub    r1, #8
-+1:      subs    r4, #1
-+        vld1.64  {q10-q11}, [r1]!
-+        vld1.64  {q12-q13}, [r1]!
-+        vld1.64  {q14}, [r1], r3
-+        sub      r1, #64
-+        // load a
-+        vext.8 q0, q10, q11, #7
-+        vext.8 q1, q11, q12, #7
-+        vext.8 q2, q12, q13, #7
-+        vext.8 q3, q13, q14, #7
-+        // load c
-+        vext.8 q4, q10, q11, #8
-+        vext.8 q5, q11, q12, #8
-+        vext.8 q6, q12, q13, #8
-+        vext.8 q7, q13, q14, #8
-+        // load b
-+        vext.8 q8, q10, q11, #9
-+        vext.8 q9, q11, q12, #9
-+        vext.8 q10, q12, q13, #9
-+        vext.8 q11, q13, q14, #9
-+        edge_w64_body
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+        sub     r1, r3
-+        // load a
-+        vld1.8  {q0-q1}, [r1]!
-+        vld1.8  {q2-q3}, [r1], r3
-+        sub     r1, #32
-+1:      subs    r4, #1
-+        // load c
-+        vld1.8  {q4-q5}, [r1]!
-+        vld1.8  {q6-q7}, [r1], r3
-+        sub     r1, #32
-+        // load b
-+        vld1.8  {q8-q9}, [r1]!
-+        vld1.8  {q10-q11}, [r1]
-+        sub     r1, #32
-+        edge_w64_body
-+        // copy c to a
-+        vmov.64 q0, q4
-+        vmov.64 q1, q5
-+        vmov.64 q2, q6
-+        vmov.64 q3, q7
-         bne   1b
-         vpop  {d8-d15}
-         pop   {r4-r8}
-         bx lr
- endfunc
-+
-+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+1:      sub     r1, r3
-+        // load a
-+        // TODO: fix unaligned load
-+        //       don't reload a like in eo1
-+        sub     r1, #1
-+        vld1.8  {q0-q1}, [r1]!
-+        vld1.8  {q2-q3}, [r1], r3
-+        sub     r1, #31
-+        subs    r4, #1
-+        // load c
-+        vld1.8  {q4-q5}, [r1]!
-+        vld1.8  {q6-q7}, [r1], r3
-+        sub     r1, #32
-+        // load b
-+        add     r1, #1
-+        vld1.8  {q8-q9}, [r1]!
-+        vld1.8  {q10-q11}, [r1]
-+        sub     r1, #33
-+        edge_w64_body
-+        // copy c to a
-+        vmov.64 q0, q4
-+        vmov.64 q1, q5
-+        vmov.64 q2, q6
-+        vmov.64 q3, q7
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+1:      sub     r1, r3
-+        // load a
-+        // TODO: fix unaligned load
-+        //       don't reload a like in eo1
-+        add     r1, #1
-+        vld1.8  {q0-q1}, [r1]!
-+        vld1.8  {q2-q3}, [r1], r3
-+        sub     r1, #33
-+        subs    r4, #1
-+        // load c
-+        vld1.8  {q4-q5}, [r1]!
-+        vld1.8  {q6-q7}, [r1], r3
-+        sub     r1, #32
-+        // load b
-+        sub     r1, #1
-+        vld1.8  {q8-q9}, [r1]!
-+        vld1.8  {q10-q11}, [r1]
-+        sub     r1, #31
-+        edge_w64_body
-+        // copy c to a
-+        vmov.64 q0, q4
-+        vmov.64 q1, q5
-+        vmov.64 q2, q6
-+        vmov.64 q3, q7
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+        sub    r1, #8 // load 8 extra bytes
-+1:      subs    r4, #1
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
-+        sub    r1, #32
-+        // a
-+        vext.8  q0, q10, q11, #7
-+        vext.8  q1, q11, q12, #7
-+        // c
-+        vext.8  q4, q10, q11, #8
-+        vext.8  q5, q11, q12, #8
-+        // b
-+        vext.8  q8, q10, q11, #9
-+        vext.8  q9, q11, q12, #9
-+        edge_w32_body
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+        // load a
-+        sub     r1, r3
-+        vld1.8  {q0-q1}, [r1], r3
-+        // load c
-+        vld1.8  {q4-q5}, [r1], r3
-+1:      subs    r4, #1
-+        // load b
-+        vld1.8  {q8-q9}, [r1], r3
-+        edge_w32_body
-+        // inputs for next loop iteration
-+        // a
-+        vmov.64 q0, q4
-+        vmov.64 q1, q5
-+        // c
-+        vmov.64 q4, q8
-+        vmov.64 q5, q9
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        ldr    r5, [r5]
-+        vpush {d8-d15}
-+        // load a
-+        sub     r1, r3
-+        sub    r1, #8
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q0, q10, q11, #7
-+        vext.8  q1, q11, q12, #7
-+        // load c
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q4, q10, q11, #8
-+        vext.8  q5, q11, q12, #8
-+        vext.8  q2, q10, q11, #7
-+1:      subs    r4, #1
-+        // load b
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q8, q10, q11, #9
-+        vext.8  q9, q11, q12, #9
-+        vext.8  q14, q10, q11, #8
-+        vext.8  q15, q11, q12, #8
-+        vext.8  q3, q10, q11, #7
-+        edge_w32_body
-+        // inputs for next loop iteration
-+        // a
-+        vmov.8 q0, q2
-+        vext.8 q1, q4, q5, #15
-+        // c
-+        vmov.8  q4, q14
-+        vmov.8  q5, q15
-+        vmov.8  q2, q3
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-+        push  {r4-r8}
-+        ldr    r4, [sp, #20] // height
-+        ldr    r5, [sp, #24] // sao_offset_val_table
-+        ldr    r6, =0x03
-+        ldr    r7, [r5]
-+        add    r5, #4
-+        sub    r1, r3
-+        ldr    r5, [r5]
-+        sub    r1, #8
-+        vpush {d8-d15}
-+        // load a
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q0, q10, q11, #9
-+        vext.8  q1, q11, q12, #9
-+        // load c
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q4, q10, q11, #8
-+        vext.8  q5, q11, q12, #8
-+        vext.8  q2, q12, q11, #8
-+1:      subs    r4, #1
-+        // load b
-+        vld1.8  {q10-q11}, [r1]
-+        add    r1, #32
-+        vld1.8  {q12}, [r1], r3
-+        sub    r1, #32
-+        vext.8  q8, q10, q11, #7
-+        vext.8  q9, q11, q12, #7
-+        vext.8  q3, q12, q10, #7
-+        edge_w32_body
-+        // inputs for next loop iteration
-+        // a
-+        vext.8 q0, q4, q5, #1
-+        vext.8 q1, q5, q2, #1
-+        // c
-+        vext.8  q4, q8, q9, #1
-+        vext.8  q5, q9, q3, #1
-+        vext.8  q2, q3, q1, #1
-+        bne   1b
-+        vpop  {d8-d15}
-+        pop   {r4-r8}
-+        bx lr
-+endfunc
-+
--- 
-2.5.0
-
-
-From 1898d052a73370166d57e17cc7c52b7275887df3 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Fri, 19 Dec 2014 09:44:10 +0200
-Subject: [PATCH 4/9] Improved SAO band NEON opimizations made SAO buffer 16
- byte aligned added alignment hints to loads and stores optimized register
- usage in SAO band neon assembly
-
----
- libavcodec/arm/hevcdsp_sao_neon.S | 212 +++++++++++++++-----------------------
- 1 file changed, 82 insertions(+), 130 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 4687012..ac21013 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -22,120 +22,84 @@
- #include "neon.S"
- 
- function ff_hevc_sao_band_w8_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // offset_table
--        vpush {d8-d15}
--        vld1.8  {q0, q1}, [r5] // offset table
-+        ldr      r12, [sp, #4]    // offset_table address
-+        vld1.8   {q0, q1}, [r12]  // offset table
-+        ldr      r12, [sp, #0]    // height
- 
--1:      subs    r4, #1
--        vld1.8   {d24}, [r1], r3
-+1:      subs     r12, #1
-+        vld1.8   {d24}, [r1,:64], r3
-         vshr.u8  d16, d24, #3
-         vtbl.8   d16, {q0, q1}, d16
--        vmovl.s8 q2, d16
-         vmovl.u8 q6, d24
--        vadd.s16 q2, q6
-+        vaddw.s8 q6, d16
-         vqmovun.s16 d4, q2
--        vst1.8  {d4}, [r0], r2
-+        vst1.8  {d4}, [r0,:64], r2
-         bne    1b
- 
--        vpop  {d8-d15}
--        pop   {r4-r8}
-         bx lr
- endfunc
- 
- function ff_hevc_sao_band_w16_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // offset_table
--        vpush {d8-d15}
--        vld1.8  {q0, q1}, [r5] // offset table
--
--1:      subs    r4, #1
--        vld1.8  {q12}, [r1], r3
-+        ldr      r12, [sp, #4]    // offset_table address
-+        vld1.8   {q0, q1}, [r12]  // offset table
-+        ldr      r12, [sp, #0]    // height
- 
-+1:      subs     r12, #1
-+        vld1.8  {q12}, [r1,:128], r3
-         vshr.u8   q8, q12, #3
--
-         vtbl.8  d16, {q0, q1}, d16
-         vtbl.8  d17, {q0, q1}, d17
--
--        vmovl.s8 q2, d16
--        vmovl.s8 q3, d17
--
--        vmovl.u8 q6, d24
--        vmovl.u8 q7, d25
--
--        vadd.s16 q2, q6
--        vadd.s16 q3, q7
--
--        vqmovun.s16 d4, q2
--        vqmovun.s16 d5, q3
--
--        vstm.8   r0, {q2}
--        add    r0, r2
-+        vmovl.u8 q10, d24
-+        vmovl.u8 q11, d25
-+        vaddw.s8 q10, d16
-+        vaddw.s8 q11, d17
-+        vqmovun.s16 d4, q10
-+        vqmovun.s16 d5, q11
-+        vst1.8   {q2}, [r0,:128], r2
-         bne    1b
- 
--        vpop  {d8-d15}
--        pop   {r4-r8}
-         bx lr
- endfunc
- 
- function ff_hevc_sao_band_w32_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // offset_table
--        vpush {d8-d15}
--        vld1.8  {q0, q1}, [r5] // offset table
--
--1:      subs    r4, #1
--        vld1.8  {q12-q13}, [r1], r3
--
--        vshr.u8   q8, q12, #3
--        vshr.u8   q9, q13, #3
--
--        vtbl.8  d16, {q0, q1}, d16
--        vtbl.8  d17, {q0, q1}, d17
--        vtbl.8  d18, {q0, q1}, d18
--        vtbl.8  d19, {q0, q1}, d19
--
--        vmovl.s8 q2, d16
--        vmovl.s8 q3, d17 // q8 free
--        vmovl.s8 q4, d18
--        vmovl.s8 q5, d19 // q9 free
--
--        vmovl.u8 q6, d24
--        vmovl.u8 q7, d25 // q12 free
--        vmovl.u8 q8, d26
--        vmovl.u8 q9, d27 // q13 free
--
--        vadd.s16 q2, q6
--        vadd.s16 q3, q7
--        vadd.s16 q4, q8
--        vadd.s16 q5, q9
--
--        vqmovun.s16 d4, q2
--        vqmovun.s16 d5, q3
--        vqmovun.s16 d6, q4 // q4 free
--        vqmovun.s16 d7, q5 // q5 free
--
--        vst1.8 {q2-q3}, [r0], r2
--        bne    1b
--
--        vpop  {d8-d15}
--        pop   {r4-r8}
--        bx lr
-+        ldr      r12, [sp, #4]    // offset_table address
-+        vld1.8   {q0, q1}, [r12]  // offset table
-+        ldr      r12, [sp, #0]    // height
-+
-+1:      subs     r12, #1
-+        vld1.8   {q2-q3}, [r1,:128], r3
-+        vshr.u8  q8, q2, #3
-+        vshr.u8  q9, q3, #3
-+        vtbl.8   d16, {q0, q1}, d16
-+        vtbl.8   d17, {q0, q1}, d17
-+        vtbl.8   d18, {q0, q1}, d18
-+        vtbl.8   d19, {q0, q1}, d19
-+        vmovl.u8 q12, d4
-+        vmovl.u8 q13, d5
-+        vmovl.u8 q14, d6
-+        vmovl.u8 q15, d7
-+        vaddw.s8 q12, d16
-+        vaddw.s8 q13, d17
-+        vaddw.s8 q14, d18
-+        vaddw.s8 q15, d19
-+        vqmovun.s16 d4, q12
-+        vqmovun.s16 d5, q13
-+        vqmovun.s16 d6, q14
-+        vqmovun.s16 d7, q15
-+        vst1.8   {q2-q3}, [r0,:128], r2
-+        bne      1b
-+
-+        bx       lr
- endfunc
- 
- function ff_hevc_sao_band_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // offset_table
--        vpush {d8-d15}
--        vld1.8  {q0, q1}, [r5] // offset table
-+        ldr      r12, [sp, #4]    // offset_table address
-+        vld1.8   {q0, q1}, [r12]  // offset table
-+        ldr      r12, [sp, #0]    // height
- 
--1:      subs    r4, #1
--        vld1.8  {q12-q13}, [r1]!
--        vld1.8  {q14-q15}, [r1], r3
-+1:      subs     r12, #1
-+        vld1.8  {q12-q13}, [r1,:128]!
-+        vld1.8  {q14-q15}, [r1,:128], r3
-         sub     r1, #32
- 
-         vshr.u8   q8, q12, #3
-@@ -152,53 +116,41 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-         vtbl.8  d22, {q0, q1}, d22
-         vtbl.8  d23, {q0, q1}, d23
- 
--        vmovl.s8 q2, d16
--        vmovl.s8 q3, d17 // q8 free
--        vmovl.s8 q4, d18
--        vmovl.s8 q5, d19 // q9 free
-+        vmovl.u8 q2, d24
-+        vmovl.u8 q3, d25
-+        vmovl.u8 q12, d26
-+        vmovl.u8 q13, d27
- 
--        vmovl.u8 q6, d24
--        vmovl.u8 q7, d25 // q12 free
--        vmovl.u8 q8, d26
--        vmovl.u8 q9, d27 // q13 free
--
--        vadd.s16 q2, q6
--        vadd.s16 q3, q7
--        vadd.s16 q4, q8
--        vadd.s16 q5, q9
-+        vaddw.s8 q2, d16
-+        vaddw.s8 q3, d17
-+        vaddw.s8 q12, d18
-+        vaddw.s8 q13, d19
- 
-         vqmovun.s16 d4, q2
-         vqmovun.s16 d5, q3
--        vqmovun.s16 d6, q4 // q4 free
--        vqmovun.s16 d7, q5 // q5 free
--
--        // free q4 -q9, q12 - q13
--        vmovl.s8 q4, d20
--        vmovl.s8 q5, d21 // q10 free
--        vmovl.s8 q6, d22
--        vmovl.s8 q7, d23 // q11 free
--
--        vmovl.u8  q8, d28
--        vmovl.u8  q9, d29 // q14 free
--        vmovl.u8 q10, d30
--        vmovl.u8 q11, d31 // q15 free
--
--        vadd.s16 q4, q8
--        vadd.s16 q5, q9
--        vadd.s16 q6, q10
--        vadd.s16 q7, q11
--
--        vqmovun.s16  d8, q4
--        vqmovun.s16  d9, q5
--        vqmovun.s16 d10, q6
--        vqmovun.s16 d11, q7
--
--        vstm.8   r0, {q2-q5}
--        add    r0, r2
-+        vqmovun.s16 d6, q12
-+        vqmovun.s16 d7, q13
-+
-+        vmovl.u8 q12, d28
-+        vmovl.u8 q13, d29
-+        vmovl.u8 q14, d30
-+        vmovl.u8 q15, d31
-+
-+        vaddw.s8 q12, d20
-+        vaddw.s8 q13, d21
-+        vaddw.s8 q14, d22
-+        vaddw.s8 q15, d23
-+
-+        vqmovun.s16  d8, q12
-+        vqmovun.s16  d9, q13
-+        vqmovun.s16 d10, q14
-+        vqmovun.s16 d11, q15
-+
-+        vst1.8     {q2-q3}, [r0,:128]!
-+        vst1.8     {q4-q5}, [r0,:128], r2
-+        sub    r0, #32
-         bne    1b
- 
--        vpop  {d8-d15}
--        pop   {r4-r8}
-         bx lr
- endfunc
- 
--- 
-2.5.0
-
-
-From 26bd536800db2f50ff6a021e1fda0d0394d1ea01 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Mon, 29 Dec 2014 15:00:49 +0200
-Subject: [PATCH 5/9] better code reuse in NEON SAO band
-
----
- libavcodec/arm/hevcdsp_init_neon.c |  16 ++--
- libavcodec/arm/hevcdsp_sao_neon.S  | 155 +++++++++++++------------------------
- 2 files changed, 61 insertions(+), 110 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index c32940e..6379810 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- 
--void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
--void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
--void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
--void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
- 
- void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
- void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-@@ -185,16 +185,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
- 
-     switch(width){
-     case 8:
--        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-         break;
-     case 16:
--        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-         break;
-     case 32:
--        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-         break;
-     case 64:
--        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-+        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-         break;
-     default:
-         for (y = 0; y < height; y++) {
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index ac21013..8852550 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -21,53 +21,13 @@
- #include "libavutil/arm/asm.S"
- #include "neon.S"
- 
--function ff_hevc_sao_band_w8_neon_8, export=1
--        ldr      r12, [sp, #4]    // offset_table address
-+.macro init_sao_band
-+        ldr      r12, [sp, #0]    // offset_table address
-         vld1.8   {q0, q1}, [r12]  // offset table
--        ldr      r12, [sp, #0]    // height
--
--1:      subs     r12, #1
--        vld1.8   {d24}, [r1,:64], r3
--        vshr.u8  d16, d24, #3
--        vtbl.8   d16, {q0, q1}, d16
--        vmovl.u8 q6, d24
--        vaddw.s8 q6, d16
--        vqmovun.s16 d4, q2
--        vst1.8  {d4}, [r0,:64], r2
--        bne    1b
--
--        bx lr
--endfunc
--
--function ff_hevc_sao_band_w16_neon_8, export=1
--        ldr      r12, [sp, #4]    // offset_table address
--        vld1.8   {q0, q1}, [r12]  // offset table
--        ldr      r12, [sp, #0]    // height
--
--1:      subs     r12, #1
--        vld1.8  {q12}, [r1,:128], r3
--        vshr.u8   q8, q12, #3
--        vtbl.8  d16, {q0, q1}, d16
--        vtbl.8  d17, {q0, q1}, d17
--        vmovl.u8 q10, d24
--        vmovl.u8 q11, d25
--        vaddw.s8 q10, d16
--        vaddw.s8 q11, d17
--        vqmovun.s16 d4, q10
--        vqmovun.s16 d5, q11
--        vst1.8   {q2}, [r0,:128], r2
--        bne    1b
--
--        bx lr
--endfunc
--
--function ff_hevc_sao_band_w32_neon_8, export=1
--        ldr      r12, [sp, #4]    // offset_table address
--        vld1.8   {q0, q1}, [r12]  // offset table
--        ldr      r12, [sp, #0]    // height
-+        ldr      r12, [sp, #4]    // height
-+.endm
- 
--1:      subs     r12, #1
--        vld1.8   {q2-q3}, [r1,:128], r3
-+.macro sao_band_32
-         vshr.u8  q8, q2, #3
-         vshr.u8  q9, q3, #3
-         vtbl.8   d16, {q0, q1}, d16
-@@ -86,6 +46,43 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-         vqmovun.s16 d5, q13
-         vqmovun.s16 d6, q14
-         vqmovun.s16 d7, q15
-+.endm
-+
-+function ff_hevc_sao_band_w8_neon_8, export=1
-+        init_sao_band
-+1:      subs     r12, #4
-+        vld1.8   {d4}, [r1,:64], r3
-+        vld1.8   {d5}, [r1,:64], r3
-+        vld1.8   {d6}, [r1,:64], r3
-+        vld1.8   {d7}, [r1,:64], r3
-+        sao_band_32
-+        vst1.8  {d4}, [r0,:64], r2
-+        vst1.8  {d5}, [r0,:64], r2
-+        vst1.8  {d6}, [r0,:64], r2
-+        vst1.8  {d7}, [r0,:64], r2
-+        bne    1b
-+
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_band_w16_neon_8, export=1
-+        init_sao_band
-+1:      subs     r12, #2
-+        vld1.8  {q2}, [r1,:128], r3
-+        vld1.8  {q3}, [r1,:128], r3
-+        sao_band_32
-+        vst1.8   {q2}, [r0,:128], r2
-+        vst1.8   {q3}, [r0,:128], r2
-+        bne    1b
-+
-+        bx lr
-+endfunc
-+
-+function ff_hevc_sao_band_w32_neon_8, export=1
-+        init_sao_band
-+1:      subs     r12, #1
-+        vld1.8   {q2-q3}, [r1,:128], r3
-+        sao_band_32
-         vst1.8   {q2-q3}, [r0,:128], r2
-         bne      1b
- 
-@@ -93,63 +90,17 @@ function ff_hevc_sao_band_w32_neon_8, export=1
- endfunc
- 
- function ff_hevc_sao_band_w64_neon_8, export=1
--        ldr      r12, [sp, #4]    // offset_table address
--        vld1.8   {q0, q1}, [r12]  // offset table
--        ldr      r12, [sp, #0]    // height
--
--1:      subs     r12, #1
--        vld1.8  {q12-q13}, [r1,:128]!
--        vld1.8  {q14-q15}, [r1,:128], r3
--        sub     r1, #32
--
--        vshr.u8   q8, q12, #3
--        vshr.u8   q9, q13, #3
--        vshr.u8  q10, q14, #3
--        vshr.u8  q11, q15, #3
--
--        vtbl.8  d16, {q0, q1}, d16
--        vtbl.8  d17, {q0, q1}, d17
--        vtbl.8  d18, {q0, q1}, d18
--        vtbl.8  d19, {q0, q1}, d19
--        vtbl.8  d20, {q0, q1}, d20
--        vtbl.8  d21, {q0, q1}, d21
--        vtbl.8  d22, {q0, q1}, d22
--        vtbl.8  d23, {q0, q1}, d23
--
--        vmovl.u8 q2, d24
--        vmovl.u8 q3, d25
--        vmovl.u8 q12, d26
--        vmovl.u8 q13, d27
--
--        vaddw.s8 q2, d16
--        vaddw.s8 q3, d17
--        vaddw.s8 q12, d18
--        vaddw.s8 q13, d19
--
--        vqmovun.s16 d4, q2
--        vqmovun.s16 d5, q3
--        vqmovun.s16 d6, q12
--        vqmovun.s16 d7, q13
--
--        vmovl.u8 q12, d28
--        vmovl.u8 q13, d29
--        vmovl.u8 q14, d30
--        vmovl.u8 q15, d31
--
--        vaddw.s8 q12, d20
--        vaddw.s8 q13, d21
--        vaddw.s8 q14, d22
--        vaddw.s8 q15, d23
--
--        vqmovun.s16  d8, q12
--        vqmovun.s16  d9, q13
--        vqmovun.s16 d10, q14
--        vqmovun.s16 d11, q15
--
--        vst1.8     {q2-q3}, [r0,:128]!
--        vst1.8     {q4-q5}, [r0,:128], r2
--        sub    r0, #32
--        bne    1b
-+        init_sao_band
-+1:      subs      r12, #1
-+        vld1.8    {q2-q3}, [r1,:128]!
-+        sao_band_32
-+        vst1.8    {q2-q3}, [r0,:128]!
-+        vld1.8    {q2-q3}, [r1,:128], r3
-+        sub       r1, #32
-+        sao_band_32
-+        vst1.8    {q2-q3}, [r0,:128], r2
-+        sub       r0, #32
-+        bne       1b
- 
-         bx lr
- endfunc
--- 
-2.5.0
-
-
-From f93646a97bc885b81759e774d04be3781916a3e7 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Wed, 7 Jan 2015 15:27:38 +0200
-Subject: [PATCH 6/9] More SAO NEON optimizations Now uses only 8 bit integers
- for SAO calculations
-
----
- libavcodec/arm/hevcdsp_init_neon.c |   7 +-
- libavcodec/arm/hevcdsp_sao_neon.S  | 664 +++++++++++++++----------------------
- 2 files changed, 272 insertions(+), 399 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 6379810..8d6e863 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -225,7 +225,7 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-     int x, y;
- 
-     for (x = 0; x < 5; x++) {
--        sao_offset_val[x] = _sao_offset_val[x];
-+        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-     }
- 
-     stride_src /= sizeof(pixel);
-@@ -271,8 +271,9 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-             for (x = 0; x < width; x++) {
-                 int diff0         = CMP(src[x], src[x + a_stride]);
-                 int diff1         = CMP(src[x], src[x + b_stride]);
--                int offset_val    = edge_idx[2 + diff0 + diff1];
--                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-+                int idx           = diff0 + diff1;
-+                if (idx)
-+                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
-             }
-             src += stride_src;
-             dst += stride_dst;
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 8852550..5fc482b 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -1,5 +1,5 @@
- /*
-- * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-  *
-  * This file is part of FFmpeg.
-  *
-@@ -23,6 +23,7 @@
- 
- .macro init_sao_band
-         ldr      r12, [sp, #0]    // offset_table address
-+        pld      [r1]
-         vld1.8   {q0, q1}, [r12]  // offset table
-         ldr      r12, [sp, #4]    // height
- .endm
-@@ -30,36 +31,31 @@
- .macro sao_band_32
-         vshr.u8  q8, q2, #3
-         vshr.u8  q9, q3, #3
-+        vmov.u8  q14, #128
-         vtbl.8   d16, {q0, q1}, d16
-         vtbl.8   d17, {q0, q1}, d17
-         vtbl.8   d18, {q0, q1}, d18
-         vtbl.8   d19, {q0, q1}, d19
--        vmovl.u8 q12, d4
--        vmovl.u8 q13, d5
--        vmovl.u8 q14, d6
--        vmovl.u8 q15, d7
--        vaddw.s8 q12, d16
--        vaddw.s8 q13, d17
--        vaddw.s8 q14, d18
--        vaddw.s8 q15, d19
--        vqmovun.s16 d4, q12
--        vqmovun.s16 d5, q13
--        vqmovun.s16 d6, q14
--        vqmovun.s16 d7, q15
-+        vadd.s8  q2, q14
-+        vadd.s8  q3, q14
-+        vqadd.s8 q2, q8
-+        vqadd.s8 q3, q9
-+        vsub.s8  q2, q14
-+        vsub.s8  q3, q14
- .endm
- 
- function ff_hevc_sao_band_w8_neon_8, export=1
-         init_sao_band
- 1:      subs     r12, #4
--        vld1.8   {d4}, [r1,:64], r3
--        vld1.8   {d5}, [r1,:64], r3
--        vld1.8   {d6}, [r1,:64], r3
--        vld1.8   {d7}, [r1,:64], r3
-+        vld1.8   {d4}, [r1, :64], r3
-+        vld1.8   {d5}, [r1, :64], r3
-+        vld1.8   {d6}, [r1, :64], r3
-+        vld1.8   {d7}, [r1, :64], r3
-         sao_band_32
--        vst1.8  {d4}, [r0,:64], r2
--        vst1.8  {d5}, [r0,:64], r2
--        vst1.8  {d6}, [r0,:64], r2
--        vst1.8  {d7}, [r0,:64], r2
-+        vst1.8  {d4}, [r0, :64], r2
-+        vst1.8  {d5}, [r0, :64], r2
-+        vst1.8  {d6}, [r0, :64], r2
-+        vst1.8  {d7}, [r0, :64], r2
-         bne    1b
- 
-         bx lr
-@@ -68,11 +64,11 @@ endfunc
- function ff_hevc_sao_band_w16_neon_8, export=1
-         init_sao_band
- 1:      subs     r12, #2
--        vld1.8  {q2}, [r1,:128], r3
--        vld1.8  {q3}, [r1,:128], r3
-+        vld1.8  {q2}, [r1, :128], r3
-+        vld1.8  {q3}, [r1, :128], r3
-         sao_band_32
--        vst1.8   {q2}, [r0,:128], r2
--        vst1.8   {q3}, [r0,:128], r2
-+        vst1.8   {q2}, [r0, :128], r2
-+        vst1.8   {q3}, [r0, :128], r2
-         bne    1b
- 
-         bx lr
-@@ -81,9 +77,9 @@ endfunc
- function ff_hevc_sao_band_w32_neon_8, export=1
-         init_sao_band
- 1:      subs     r12, #1
--        vld1.8   {q2-q3}, [r1,:128], r3
-+        vld1.8   {q2-q3}, [r1, :128], r3
-         sao_band_32
--        vst1.8   {q2-q3}, [r0,:128], r2
-+        vst1.8   {q2-q3}, [r0, :128], r2
-         bne      1b
- 
-         bx       lr
-@@ -92,263 +88,153 @@ endfunc
- function ff_hevc_sao_band_w64_neon_8, export=1
-         init_sao_band
- 1:      subs      r12, #1
--        vld1.8    {q2-q3}, [r1,:128]!
-+        pld       [r1, r3]
-+        vld1.8    {q2-q3}, [r1, :128]!
-         sao_band_32
--        vst1.8    {q2-q3}, [r0,:128]!
--        vld1.8    {q2-q3}, [r1,:128], r3
-+        vst1.8    {q2-q3}, [r0, :128]!
-+        vld1.8    {q2-q3}, [r1, :128], r3
-         sub       r1, #32
-         sao_band_32
--        vst1.8    {q2-q3}, [r0,:128], r2
-+        vst1.8    {q2-q3}, [r0, :128], r2
-         sub       r0, #32
-         bne       1b
- 
-         bx lr
- endfunc
--
-+// input
-+// a in q0 - q3
-+// c in q4 - q7
-+// b in q8 - q11
-+// offset table in r7 and r5
-+// output in q0 - q3
-+// clobbers q12 - q15
- .macro edge_w64_body
--        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
--        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
--        vcgt.u8 q13, q5, q1
--        vcgt.u8 q1,  q1, q5
--        vcgt.u8 q14, q6, q2
--        vcgt.u8 q2,  q2, q6
--        vcgt.u8 q15, q7, q3
--        vcgt.u8 q3,  q3, q7
--
--        vsub.s8 q12, q0, q12 // diff0
--        vsub.s8 q13, q1, q13
--        vsub.s8 q14, q2, q14
--        vsub.s8 q15, q3, q15
--
-+        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
-+        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
-+        vcgt.u8 q13,  q5, q1
-+        vcgt.u8  q1,  q1, q5
-+        vsub.s8 q12,  q0, q12 // diff0
-         vcgt.u8  q0,  q4, q8 // c > b
--        vcgt.u8  q8,  q8, q4 // b > c
-+        vsub.s8 q13,  q1, q13
-+
-+        vcgt.u8 q14,  q8, q4 // b > c
-         vcgt.u8  q1,  q5, q9
--        vcgt.u8  q9,  q9, q5
--        vcgt.u8  q2,  q6, q10
--        vcgt.u8 q10, q10, q6
--        vcgt.u8  q3,  q7, q11
--        vcgt.u8 q11, q11, q7
-+        vcgt.u8 q15,  q9, q5
-+        vsub.s8  q0, q14, q0 // diff1
- 
--        vsub.s8 q0, q8, q0 // diff1
--        vsub.s8 q1, q9, q1
--        vsub.s8 q2, q10, q2
--        vsub.s8 q3, q11, q3
-+        vsub.s8  q1, q15, q1
- 
--        vadd.s8 q0, q12 //diff0 + diff1
--        vadd.s8 q1, q13
--        vadd.s8 q2, q14
--        vadd.s8 q3, q15
--
--        vdup.s8 q9, r6 // 3 to all elements
--        sub     r6, #1
--
--        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
--        vclt.s8 q13, q1, #0
--        vclt.s8 q14, q2, #0
--        vclt.s8 q15, q3, #0
--
--        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
--        vadd.s8  q10,  q1, q9
--        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
--        vand.8   q13, q10, q13
--        vadd.s8  q8,  q2, q9
--        vadd.s8  q10,  q3, q9
--        vand.8   q14, q8, q14
--        vand.8   q15, q10, q15
--
--        vdup.s8 q9, r6  // 2 to all elements
--        add     r6, #1
--
--        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
--        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
--        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vcgt.s8  q10, q1, #0
--        vadd.s8   q0, q11, q12  // offset_idx
--
--        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
--        vcgt.s8  q12, q2, #0
--        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
--        vadd.s8   q1, q11, q13
--
--        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vcgt.s8  q10, q3, #0
--        vadd.s8   q2, q11, q14
--
--        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
--        vmov.32  d18[0], r7  // load offset table from general registers
--        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vmov.32  d18[1], r5  // load rest of offset table
--        vadd.s8   q3, q11, q15
--
--        vtbl.8   d0, {d18}, d0
--        vtbl.8   d1, {d18}, d1
--        vtbl.8   d2, {d18}, d2
--        vtbl.8   d3, {d18}, d3
--        vtbl.8   d4, {d18}, d4
--        vtbl.8   d5, {d18}, d5
--        vtbl.8   d6, {d18}, d6
--        vtbl.8   d7, {d18}, d7
--
--        vmovl.u8   q8, d8
--        vmovl.u8   q9, d9
--        vmovl.u8  q10, d10
--        vmovl.u8  q11, d11
--        vmovl.u8  q12, d12
--        vmovl.u8  q13, d13
--        vmovl.u8  q14, d14
--        vmovl.u8  q15, d15
--
--        vaddw.s8  q8, d0
--        vaddw.s8  q9, d1
--        vaddw.s8 q10, d2
--        vaddw.s8 q11, d3
--        vaddw.s8 q12, d4
--        vaddw.s8 q13, d5
--        vaddw.s8 q14, d6
--        vaddw.s8 q15, d7
--
--        vqmovun.s16  d0, q8
--        vqmovun.s16  d1, q9
--        vqmovun.s16  d2, q10
--        vqmovun.s16  d3, q11
--        vqmovun.s16  d4, q12
--        vqmovun.s16  d5, q13
--        vqmovun.s16  d6, q14
--        vqmovun.s16  d7, q15
--
--        vstm r0, {q0-q3}
--        add  r0, r2
--.endm
-+        vadd.s8  q0, q12 //diff0 + diff1
-+        vadd.s8  q1, q13
- 
--.macro edge_w32_body
--        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
--        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
--        vcgt.u8 q13, q5, q1
--        vcgt.u8 q1,  q1, q5
-+        vcgt.u8 q14,  q6, q2
-+        vcgt.u8  q2,  q2, q6
-+        vcgt.u8 q15,  q7, q3
-+        vcgt.u8  q3,  q3, q7
- 
--        vsub.s8 q12, q0, q12 // diff0
--        vcgt.u8  q0,  q4, q8 // c > b
--        vsub.s8 q13, q1, q13 // diff0 part 2
-+        vsub.s8 q14,  q2, q14
-+        vcgt.u8  q2,  q6, q10
-+        vsub.s8 q15,  q3, q15
- 
--        vcgt.u8  q6,  q8, q4 // b > c
--        vcgt.u8  q1,  q5, q9
--        vcgt.u8  q7,  q9, q5
-+        vcgt.u8 q12, q10, q6
-+        vcgt.u8  q3,  q7, q11
-+        vcgt.u8 q13, q11, q7
-+        vsub.s8  q2, q12, q2
-+        vsub.s8  q3, q13, q3
- 
--        vsub.s8 q0, q6, q0 // diff1
--        vsub.s8 q1, q7, q1 // diff1 part 2
--        vadd.s8 q0, q12 //diff0 + diff1
-+        vmov.s8 q13, #2 // 2 to all elements
- 
--        vdup.s8 q7, r6 // 3 to all elements
--        sub     r6, #1
--        vadd.s8 q1, q13
-+        vadd.s8  q2, q14
-+        vadd.s8  q3, q15
-+
-+        vmov.32  d24[0], r4  // load offset table from general registers
-+        vmov.32  d24[1], r5  // load rest of offset table
- 
--        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
--        vclt.s8 q13, q1, #0
--
--        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
--        vadd.s8  q10,  q1, q7
--        vdup.s8 q7, r6  // 2 to all elements
--        add     r6, #1
--        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
--        vand.8   q13, q10, q13
--
--
--        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
--        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
--        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vcgt.s8  q10, q1, #0
--        vadd.s8   q0, q11, q12  // offset_idx
--
--        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
--        vmov.32  d14[0], r7  // load offset table from general registers
--        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
--        vmov.32  d14[1], r5  // load rest of offset table
--        vadd.s8   q1, q11, q13
--
--        vtbl.8   d0, {d14}, d0
--        vtbl.8   d1, {d14}, d1
--        vtbl.8   d2, {d14}, d2
--        vtbl.8   d3, {d14}, d3
--
--        vmovl.u8   q6, d8
--        vmovl.u8   q7, d9
--        vmovl.u8  q10, d10
--        vmovl.u8  q11, d11
--
--        vaddw.s8  q6, d0
--        vaddw.s8  q7, d1
--        vaddw.s8 q10, d2
--        vaddw.s8 q11, d3
--
--        vqmovun.s16  d0, q6
--        vqmovun.s16  d1, q7
--        vqmovun.s16  d2, q10
--        vqmovun.s16  d3, q11
--
--        vstm r0, {q0-q1}
--        add  r0, r2
-+        vadd.s8 q0, q13
-+        vadd.s8 q1, q13
-+        vadd.s8 q2, q13
-+        vadd.s8 q3, q13
-+
-+        vmov.u8  q15, #128 // s8 #-128
-+        vtbl.8   d0, {d24}, d0
-+        vtbl.8   d1, {d24}, d1
-+        vtbl.8   d2, {d24}, d2
-+        vtbl.8   d3, {d24}, d3
-+        vtbl.8   d4, {d24}, d4
-+        vtbl.8   d5, {d24}, d5
-+        vtbl.8   d6, {d24}, d6
-+        vtbl.8   d7, {d24}, d7
-+
-+        vadd.s8  q12,  q4, q15
-+        vadd.s8  q13,  q5, q15
-+        vadd.s8  q14,  q6, q15
-+        vadd.s8  q15,  q7, q15
-+        vqadd.s8 q12,  q0
-+        vqadd.s8 q15,  q3
-+        vmov.u8   q3, #128 // s8 #-128
-+        vqadd.s8 q13,  q1
-+        vqadd.s8 q14,  q2
-+        vsub.s8   q0, q12, q3
-+        vsub.s8   q1, q13, q3
-+        vsub.s8   q2, q14, q3
-+        vsub.s8   q3, q15, q3
-+        vst1.8  {q0-q1}, [r0, :128]!
-+        vst1.8  {q2-q3}, [r0, :128], r2
-+        sub     r0, #32
- .endm
- 
--function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
-+.macro init_edge_64
-+        push   {r4-r5}
-+        ldr    r12, [sp, #8] // height
-+        ldr    r5, [sp, #12] // sao_offset_val_table
-+        ldr    r4, [r5]
-         add    r5, #4
-         ldr    r5, [r5]
-+.endm
-+
-+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-+        init_edge_64
-         vpush {d8-d15}
-         sub    r1, #8
--1:      subs    r4, #1
--        vld1.64  {q10-q11}, [r1]!
--        vld1.64  {q12-q13}, [r1]!
--        vld1.64  {q14}, [r1], r3
--        sub      r1, #64
-+1:      subs    r12, #1
-+        vld1.64  {d7}, [r1, :64]!
-+        vld1.64  {q4-q5}, [r1, :128]! // load c
-+        vld1.64  {q6-q7}, [r1, :128]!
-+        vld1.64  {d24}, [r1, :64], r3
-+        sub      r1, #72
-         // load a
--        vext.8 q0, q10, q11, #7
--        vext.8 q1, q11, q12, #7
--        vext.8 q2, q12, q13, #7
--        vext.8 q3, q13, q14, #7
--        // load c
--        vext.8 q4, q10, q11, #8
--        vext.8 q5, q11, q12, #8
--        vext.8 q6, q12, q13, #8
--        vext.8 q7, q13, q14, #8
-+        vext.8 q0, q3, q4, #15
-+        vext.8 q1, q4, q5, #15
-+        vext.8 q2, q5, q6, #15
-+        vext.8 q3, q6, q7, #15
-         // load b
--        vext.8 q8, q10, q11, #9
--        vext.8 q9, q11, q12, #9
--        vext.8 q10, q12, q13, #9
--        vext.8 q11, q13, q14, #9
-+        vext.8 q8, q4, q5, #1
-+        vext.8 q9, q5, q6, #1
-+        vext.8 q10, q6, q7, #1
-+        vext.8 q11, q7, q12, #1
-         edge_w64_body
-         bne   1b
-         vpop  {d8-d15}
--        pop   {r4-r8}
-+        pop   {r4-r5}
-         bx lr
- endfunc
- 
- function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
-+        init_edge_64
-         vpush {d8-d15}
-         sub     r1, r3
-         // load a
--        vld1.8  {q0-q1}, [r1]!
--        vld1.8  {q2-q3}, [r1], r3
-+        vld1.8  {q0-q1}, [r1, :128]!
-+        vld1.8  {q2-q3}, [r1, :128], r3
-         sub     r1, #32
--1:      subs    r4, #1
-         // load c
--        vld1.8  {q4-q5}, [r1]!
--        vld1.8  {q6-q7}, [r1], r3
-+        vld1.8  {q4-q5}, [r1, :128]!
-+        vld1.8  {q6-q7}, [r1, :128], r3
-         sub     r1, #32
-+1:      subs    r12, #1
-         // load b
--        vld1.8  {q8-q9}, [r1]!
--        vld1.8  {q10-q11}, [r1]
-+        vld1.8  {q8-q9}, [r1, :128]!
-+        vld1.8  {q10-q11}, [r1, :128], r3
-         sub     r1, #32
-         edge_w64_body
-         // copy c to a
-@@ -356,20 +242,19 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-         vmov.64 q1, q5
-         vmov.64 q2, q6
-         vmov.64 q3, q7
-+        // copy b to c
-+        vmov.64 q4, q8
-+        vmov.64 q5, q9
-+        vmov.64 q6, q10
-+        vmov.64 q7, q11
-         bne   1b
-         vpop  {d8-d15}
--        pop   {r4-r8}
-+        pop   {r4-r5}
-         bx lr
- endfunc
- 
- function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
-+        init_edge_64
-         vpush {d8-d15}
- 1:      sub     r1, r3
-         // load a
-@@ -379,10 +264,10 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-         vld1.8  {q0-q1}, [r1]!
-         vld1.8  {q2-q3}, [r1], r3
-         sub     r1, #31
--        subs    r4, #1
-+        subs    r12, #1
-         // load c
--        vld1.8  {q4-q5}, [r1]!
--        vld1.8  {q6-q7}, [r1], r3
-+        vld1.8  {q4-q5}, [r1, :128]!
-+        vld1.8  {q6-q7}, [r1, :128], r3
-         sub     r1, #32
-         // load b
-         add     r1, #1
-@@ -390,25 +275,14 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-         vld1.8  {q10-q11}, [r1]
-         sub     r1, #33
-         edge_w64_body
--        // copy c to a
--        vmov.64 q0, q4
--        vmov.64 q1, q5
--        vmov.64 q2, q6
--        vmov.64 q3, q7
-         bne   1b
-         vpop  {d8-d15}
--        pop   {r4-r8}
-+        pop   {r4-r5}
-         bx lr
- endfunc
- 
- function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
-+        init_edge_64
-         vpush {d8-d15}
- 1:      sub     r1, r3
-         // load a
-@@ -418,10 +292,10 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-         vld1.8  {q0-q1}, [r1]!
-         vld1.8  {q2-q3}, [r1], r3
-         sub     r1, #33
--        subs    r4, #1
-+        subs    r12, #1
-         // load c
--        vld1.8  {q4-q5}, [r1]!
--        vld1.8  {q6-q7}, [r1], r3
-+        vld1.8  {q4-q5}, [r1, :128]!
-+        vld1.8  {q6-q7}, [r1, :128], r3
-         sub     r1, #32
-         // load b
-         sub     r1, #1
-@@ -429,178 +303,176 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-         vld1.8  {q10-q11}, [r1]
-         sub     r1, #31
-         edge_w64_body
--        // copy c to a
--        vmov.64 q0, q4
--        vmov.64 q1, q5
--        vmov.64 q2, q6
--        vmov.64 q3, q7
-         bne   1b
-         vpop  {d8-d15}
--        pop   {r4-r8}
-+        pop   {r4-r5}
-         bx lr
- endfunc
- 
-+// inputs:
-+// a in q0, q1
-+// c in q2, q3
-+// b in q8, q9
-+// offset table in d31
-+// clobbered registers q0, q1, q10, q11, q12, q13
-+// output q0, q1
-+.macro edge_w32_body
-+        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
-+        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
-+        vcgt.u8 q13, q3, q1
-+        vcgt.u8 q1,  q1, q3
-+
-+        vsub.s8 q12, q0, q12 // diff0
-+        vcgt.u8  q0,  q2, q8 // c > b
-+        vsub.s8 q13, q1, q13 // diff0 part 2
-+
-+        vcgt.u8  q10,  q8, q2 // b > c
-+        vcgt.u8  q1,  q3, q9
-+        vcgt.u8  q11,  q9, q3
-+
-+        vsub.s8 q0, q10, q0 // diff1
-+
-+        vmov.s8 q10, #2 // 2 to all elements
-+        vsub.s8 q1, q11, q1 // diff1 part 2
-+        vadd.s8 q0, q12 //diff0 + diff1
-+        vadd.s8 q1, q13
-+
-+        vadd.s8 q0, q10
-+        vadd.s8 q1, q10
-+
-+        vmov.u8  q10, #128
-+        vtbl.8   d0, {d31}, d0
-+        vtbl.8   d1, {d31}, d1
-+        vtbl.8   d2, {d31}, d2
-+        vtbl.8   d3, {d31}, d3
-+
-+        vadd.s8    q11, q2, q10
-+        vadd.s8    q12, q3, q10
-+        vqadd.s8   q11, q0
-+        vqadd.s8   q12, q1
-+        vsub.s8    q0, q11, q10
-+        vsub.s8    q1, q12, q10
-+        vst1.8   {q0-q1}, [r0, :128], r2
-+.endm
-+
-+.macro init_edge_32
-+        ldr     r12, [sp, #4] // sao_offset_val_table
-+        vld1.32 {d31}, [r12]
-+        ldr     r12, [sp] // height
-+.endm
-+
- function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
--        vpush {d8-d15}
--        sub    r1, #8 // load 8 extra bytes
--1:      subs    r4, #1
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
--        sub    r1, #32
-+        init_edge_32
-+        sub     r1, #4 // load 4 extra bytes
-+1:      subs    r12, #1
-+        vld1.32 d3[1], [r1]!
-+        vld1.8  {q2-q3}, [r1, :128]! // c
-+        vld1.32 d20[0], [r1], r3
-+        sub     r1, #36
-         // a
--        vext.8  q0, q10, q11, #7
--        vext.8  q1, q11, q12, #7
--        // c
--        vext.8  q4, q10, q11, #8
--        vext.8  q5, q11, q12, #8
-+        vext.8  q0, q1, q2, #15
-+        vext.8  q1, q2, q3, #15
-         // b
--        vext.8  q8, q10, q11, #9
--        vext.8  q9, q11, q12, #9
-+        vext.8  q8, q2, q3, #1
-+        vext.8  q9, q3, q10, #1
-         edge_w32_body
--        bne   1b
--        vpop  {d8-d15}
--        pop   {r4-r8}
--        bx lr
-+        bne     1b
-+        bx      lr
- endfunc
- 
- function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
--        vpush {d8-d15}
-+        init_edge_32
-         // load a
-         sub     r1, r3
--        vld1.8  {q0-q1}, [r1], r3
-+        vld1.8  {q0-q1}, [r1, :128], r3
-         // load c
--        vld1.8  {q4-q5}, [r1], r3
--1:      subs    r4, #1
-+        vld1.8  {q2-q3}, [r1, :128], r3
-+1:      subs    r12, #1
-         // load b
--        vld1.8  {q8-q9}, [r1], r3
-+        vld1.8  {q8-q9}, [r1, :128], r3
-         edge_w32_body
-         // inputs for next loop iteration
-         // a
--        vmov.64 q0, q4
--        vmov.64 q1, q5
-+        vmov.64 q0, q2
-+        vmov.64 q1, q3
-         // c
--        vmov.64 q4, q8
--        vmov.64 q5, q9
--        bne   1b
--        vpop  {d8-d15}
--        pop   {r4-r8}
--        bx lr
-+        vmov.64 q2, q8
-+        vmov.64 q3, q9
-+        bne     1b
-+        bx      lr
- endfunc
- 
- function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        ldr    r5, [r5]
--        vpush {d8-d15}
-+        init_edge_32
-+        vpush   {d8-d15}
-         // load a
-         sub     r1, r3
--        sub    r1, #8
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
-+        sub     r1, #8
-+        vld1.8  {q10-q11}, [r1, :64]!
-+        vld1.8  {d24}, [r1, :64], r3
-+        sub     r1, #32
-         vext.8  q0, q10, q11, #7
-         vext.8  q1, q11, q12, #7
-         // load c
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
--        vext.8  q4, q10, q11, #8
--        vext.8  q5, q11, q12, #8
--        vext.8  q2, q10, q11, #7
--1:      subs    r4, #1
-+        vld1.8  {d9}, [r1, :64]!
-+        vld1.8  {q2-q3}, [r1, :64], r3
-+        sub     r1, #8
-+        vext.8  q4, q4, q2, #15
-+1:      subs    r12, #1
-         // load b
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
-+        vld1.8  {q10-q11}, [r1, :64]!
-+        vld1.8  {q12}, [r1, :64], r3
-+        sub     r1, #32
-         vext.8  q8, q10, q11, #9
-         vext.8  q9, q11, q12, #9
--        vext.8  q14, q10, q11, #8
--        vext.8  q15, q11, q12, #8
--        vext.8  q3, q10, q11, #7
-+        vext.8  q6, q10, q11, #8
-+        vext.8  q7, q11, q12, #8
-+        vext.8  q5, q10, q11, #7
-         edge_w32_body
-         // inputs for next loop iteration
-         // a
--        vmov.8 q0, q2
--        vext.8 q1, q4, q5, #15
-+        vmov.8  q0, q4
-+        vext.8  q1, q2, q3, #15
-         // c
--        vmov.8  q4, q14
--        vmov.8  q5, q15
--        vmov.8  q2, q3
--        bne   1b
--        vpop  {d8-d15}
--        pop   {r4-r8}
--        bx lr
-+        vmov.8  q2, q6
-+        vmov.8  q3, q7
-+        vmov.8  q4, q5
-+        bne     1b
-+        vpop    {d8-d15}
-+        bx      lr
- endfunc
- 
- function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
--        push  {r4-r8}
--        ldr    r4, [sp, #20] // height
--        ldr    r5, [sp, #24] // sao_offset_val_table
--        ldr    r6, =0x03
--        ldr    r7, [r5]
--        add    r5, #4
--        sub    r1, r3
--        ldr    r5, [r5]
--        sub    r1, #8
--        vpush {d8-d15}
-+        init_edge_32
-+        sub     r1, r3
-         // load a
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
--        vext.8  q0, q10, q11, #9
--        vext.8  q1, q11, q12, #9
-+        vld1.8  {q10-q11}, [r1, :64]!
-+        vld1.8  {d24}, [r1, :64], r3
-+        sub     r1, #32
-+        vext.8  q0, q10, q11, #1
-+        vext.8  q1, q11, q12, #1
-         // load c
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
--        vext.8  q4, q10, q11, #8
--        vext.8  q5, q11, q12, #8
--        vext.8  q2, q12, q11, #8
--1:      subs    r4, #1
-+        vld1.8  {q2-q3}, [r1, :64]!
-+        vld1.8  {d30}, [r1, :64], r3
-+        sub     r1, #40
-+1:      subs    r12, #1
-         // load b
--        vld1.8  {q10-q11}, [r1]
--        add    r1, #32
--        vld1.8  {q12}, [r1], r3
--        sub    r1, #32
-+        vld1.8  {q10-q11}, [r1, :64]!
-+        vld1.8  {q12}, [r1, :64], r3
-+        sub     r1, #32
-         vext.8  q8, q10, q11, #7
-         vext.8  q9, q11, q12, #7
--        vext.8  q3, q12, q10, #7
-+        vext.8  q14, q12, q10, #7
-         edge_w32_body
-         // inputs for next loop iteration
-         // a
--        vext.8 q0, q4, q5, #1
--        vext.8 q1, q5, q2, #1
-+        vext.8  q0, q2, q3, #1
-+        vext.8  q1, q3, q15, #1
-         // c
--        vext.8  q4, q8, q9, #1
--        vext.8  q5, q9, q3, #1
--        vext.8  q2, q3, q1, #1
--        bne   1b
--        vpop  {d8-d15}
--        pop   {r4-r8}
--        bx lr
-+        vext.8  q2, q8, q9, #1
-+        vext.8  q3, q9, q14, #1
-+        vext.8  d30, d28, d2, #1
-+        bne     1b
-+        bx      lr
- endfunc
- 
--- 
-2.5.0
-
-
-From 016c39d46b86830204a4519590332d2a38f7ee51 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Thu, 8 Jan 2015 09:58:55 +0200
-Subject: [PATCH 7/9] small optimization to SAO BAND. correct path for
- bit_depth_template.c
-
----
- libavcodec/arm/hevcdsp_init_neon.c | 2 +-
- libavcodec/arm/hevcdsp_sao_neon.S  | 2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 8d6e863..385c35d 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -23,7 +23,7 @@
- #include "libavcodec/hevcdsp.h"
- #include "hevcdsp_arm.h"
- #include "libavcodec/avcodec.h"
--#include "../bit_depth_template.c"
-+#include "libavcodec/bit_depth_template.c"
- 
- void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
- void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 5fc482b..710b32b 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -26,12 +26,12 @@
-         pld      [r1]
-         vld1.8   {q0, q1}, [r12]  // offset table
-         ldr      r12, [sp, #4]    // height
-+        vmov.u8  q14, #128
- .endm
- 
- .macro sao_band_32
-         vshr.u8  q8, q2, #3
-         vshr.u8  q9, q3, #3
--        vmov.u8  q14, #128
-         vtbl.8   d16, {q0, q1}, d16
-         vtbl.8   d17, {q0, q1}, d17
-         vtbl.8   d18, {q0, q1}, d18
--- 
-2.5.0
-
-
-From 579f1584d688e1ac24fb7d22697e2a7b64f62e8e Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Fri, 9 Jan 2015 10:28:52 +0200
-Subject: [PATCH 8/9] Added height check for SAO NEON optimizations. Faster SAO
- band NEON Some reordering to use NEON pipelines more efficiently
-
----
- libavcodec/arm/hevcdsp_init_neon.c |  12 +++-
- libavcodec/arm/hevcdsp_sao_neon.S  | 142 ++++++++++++++++++++++---------------
- 2 files changed, 93 insertions(+), 61 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 385c35d..6d0689c 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -176,6 +176,7 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-     int8_t offset_table[32] = { 0 };
-     int k, y, x;
-     int shift  = 3; // BIT_DEPTH - 5
-+    int cwidth = 0;
- 
-     stride_src /= sizeof(pixel);
-     stride_dst /= sizeof(pixel);
-@@ -183,7 +184,10 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-     for (k = 0; k < 4; k++)
-         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
- 
--    switch(width){
-+    if (height % 8 == 0)
-+        cwidth = width;
-+
-+    switch(cwidth){
-     case 8:
-         ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-         break;
-@@ -223,15 +227,19 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-     pixel *src = (pixel *)_src;
-     int a_stride, b_stride;
-     int x, y;
-+    int cwidth = 0;
- 
-     for (x = 0; x < 5; x++) {
-         sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-     }
- 
-+    if (height % 8 == 0)
-+        cwidth = width;
-+
-     stride_src /= sizeof(pixel);
-     stride_dst /= sizeof(pixel);
- 
--    switch (width) {
-+    switch (cwidth) {
-     case 32:
-         switch(eo) {
-         case 0:
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 710b32b..08f50b8 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -26,36 +26,59 @@
-         pld      [r1]
-         vld1.8   {q0, q1}, [r12]  // offset table
-         ldr      r12, [sp, #4]    // height
--        vmov.u8  q14, #128
-+        vmov.u8  q3, #128
- .endm
- 
--.macro sao_band_32
--        vshr.u8  q8, q2, #3
--        vshr.u8  q9, q3, #3
--        vtbl.8   d16, {q0, q1}, d16
--        vtbl.8   d17, {q0, q1}, d17
--        vtbl.8   d18, {q0, q1}, d18
--        vtbl.8   d19, {q0, q1}, d19
--        vadd.s8  q2, q14
--        vadd.s8  q3, q14
--        vqadd.s8 q2, q8
--        vqadd.s8 q3, q9
--        vsub.s8  q2, q14
--        vsub.s8  q3, q14
-+// 128 in q3
-+// input q8 - q11
-+// 32 cycles
-+.macro sao_band_64
-+        vshr.u8  q12, q8, #3
-+        vshr.u8  q13, q9, #3
-+        vshr.u8  q14, q10, #3
-+        vshr.u8  q15, q11, #3
-+        vtbl.8   d24, {d0, d1, d2, d3}, d24
-+        vadd.s8  q8, q3
-+        vtbl.8   d25, {d0, d1, d2, d3}, d25
-+        vadd.s8  q9, q3
-+        vtbl.8   d26, {d0, d1, d2, d3}, d26
-+        vadd.s8  q10, q3
-+        vtbl.8   d27, {d0, d1, d2, d3}, d27
-+        vadd.s8  q11, q3
-+        vtbl.8   d28, {d0, d1, d2, d3}, d28
-+        vqadd.s8 q8, q12
-+        vtbl.8   d29, {d0, d1, d2, d3}, d29
-+        vqadd.s8 q9, q13
-+        vtbl.8   d30, {d0, d1, d2, d3}, d30
-+        vqadd.s8 q10, q14
-+        vtbl.8   d31, {d0, d1, d2, d3}, d31
-+        vqadd.s8 q11, q15
-+        vsub.s8  q8, q3
-+        vsub.s8  q9, q3
-+        vsub.s8  q10, q3
-+        vsub.s8  q11, q3
- .endm
- 
- function ff_hevc_sao_band_w8_neon_8, export=1
-         init_sao_band
--1:      subs     r12, #4
--        vld1.8   {d4}, [r1, :64], r3
--        vld1.8   {d5}, [r1, :64], r3
--        vld1.8   {d6}, [r1, :64], r3
--        vld1.8   {d7}, [r1, :64], r3
--        sao_band_32
--        vst1.8  {d4}, [r0, :64], r2
--        vst1.8  {d5}, [r0, :64], r2
--        vst1.8  {d6}, [r0, :64], r2
--        vst1.8  {d7}, [r0, :64], r2
-+1:      subs     r12, #8
-+        vld1.8   {d16}, [r1, :64], r3
-+        vld1.8   {d17}, [r1, :64], r3
-+        vld1.8   {d18}, [r1, :64], r3
-+        vld1.8   {d19}, [r1, :64], r3
-+        vld1.8   {d20}, [r1, :64], r3
-+        vld1.8   {d21}, [r1, :64], r3
-+        vld1.8   {d22}, [r1, :64], r3
-+        vld1.8   {d23}, [r1, :64], r3
-+        sao_band_64
-+        vst1.8  {d16}, [r0, :64], r2
-+        vst1.8  {d17}, [r0, :64], r2
-+        vst1.8  {d18}, [r0, :64], r2
-+        vst1.8  {d19}, [r0, :64], r2
-+        vst1.8  {d20}, [r0, :64], r2
-+        vst1.8  {d21}, [r0, :64], r2
-+        vst1.8  {d22}, [r0, :64], r2
-+        vst1.8  {d23}, [r0, :64], r2
-         bne    1b
- 
-         bx lr
-@@ -63,12 +86,16 @@ endfunc
- 
- function ff_hevc_sao_band_w16_neon_8, export=1
-         init_sao_band
--1:      subs     r12, #2
--        vld1.8  {q2}, [r1, :128], r3
--        vld1.8  {q3}, [r1, :128], r3
--        sao_band_32
--        vst1.8   {q2}, [r0, :128], r2
--        vst1.8   {q3}, [r0, :128], r2
-+1:      subs     r12, #4
-+        vld1.8  {q8}, [r1, :128], r3
-+        vld1.8  {q9}, [r1, :128], r3
-+        vld1.8  {q10}, [r1, :128], r3
-+        vld1.8  {q11}, [r1, :128], r3
-+        sao_band_64
-+        vst1.8   {q8}, [r0, :128], r2
-+        vst1.8   {q9}, [r0, :128], r2
-+        vst1.8   {q10}, [r0, :128], r2
-+        vst1.8   {q11}, [r0, :128], r2
-         bne    1b
- 
-         bx lr
-@@ -76,10 +103,12 @@ endfunc
- 
- function ff_hevc_sao_band_w32_neon_8, export=1
-         init_sao_band
--1:      subs     r12, #1
--        vld1.8   {q2-q3}, [r1, :128], r3
--        sao_band_32
--        vst1.8   {q2-q3}, [r0, :128], r2
-+1:      subs     r12, #2
-+        vld1.8   {q8-q9}, [r1, :128], r3
-+        vld1.8   {q10-q11}, [r1, :128], r3
-+        sao_band_64
-+        vst1.8   {q8-q9}, [r0, :128], r2
-+        vst1.8   {q10-q11}, [r0, :128], r2
-         bne      1b
- 
-         bx       lr
-@@ -89,13 +118,12 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-         init_sao_band
- 1:      subs      r12, #1
-         pld       [r1, r3]
--        vld1.8    {q2-q3}, [r1, :128]!
--        sao_band_32
--        vst1.8    {q2-q3}, [r0, :128]!
--        vld1.8    {q2-q3}, [r1, :128], r3
-+        vld1.8    {q8-q9}, [r1, :128]!
-+        vld1.8    {q10-q11}, [r1, :128], r3
-         sub       r1, #32
--        sao_band_32
--        vst1.8    {q2-q3}, [r0, :128], r2
-+        sao_band_64
-+        vst1.8    {q8-q9}, [r0, :128]!
-+        vst1.8    {q10-q11}, [r0, :128], r2
-         sub       r0, #32
-         bne       1b
- 
-@@ -121,7 +149,6 @@ endfunc
-         vcgt.u8  q1,  q5, q9
-         vcgt.u8 q15,  q9, q5
-         vsub.s8  q0, q14, q0 // diff1
--
-         vsub.s8  q1, q15, q1
- 
-         vadd.s8  q0, q12 //diff0 + diff1
-@@ -157,27 +184,25 @@ endfunc
- 
-         vmov.u8  q15, #128 // s8 #-128
-         vtbl.8   d0, {d24}, d0
-+        vadd.s8  q13,  q4, q15
-         vtbl.8   d1, {d24}, d1
-+        vadd.s8  q14,  q5, q15
-         vtbl.8   d2, {d24}, d2
-+        vqadd.s8 q0, q13
-         vtbl.8   d3, {d24}, d3
-+        vqadd.s8 q1, q14
-         vtbl.8   d4, {d24}, d4
-+        vadd.s8  q13,  q6, q15
-         vtbl.8   d5, {d24}, d5
-+        vadd.s8  q14,  q7, q15
-         vtbl.8   d6, {d24}, d6
-+        vqadd.s8 q2, q13
-         vtbl.8   d7, {d24}, d7
--
--        vadd.s8  q12,  q4, q15
--        vadd.s8  q13,  q5, q15
--        vadd.s8  q14,  q6, q15
--        vadd.s8  q15,  q7, q15
--        vqadd.s8 q12,  q0
--        vqadd.s8 q15,  q3
--        vmov.u8   q3, #128 // s8 #-128
--        vqadd.s8 q13,  q1
--        vqadd.s8 q14,  q2
--        vsub.s8   q0, q12, q3
--        vsub.s8   q1, q13, q3
--        vsub.s8   q2, q14, q3
--        vsub.s8   q3, q15, q3
-+        vqadd.s8 q3, q14
-+        vsub.s8   q0, q15
-+        vsub.s8   q1, q15
-+        vsub.s8   q2, q15
-+        vsub.s8   q3, q15
-         vst1.8  {q0-q1}, [r0, :128]!
-         vst1.8  {q2-q3}, [r0, :128], r2
-         sub     r0, #32
-@@ -342,13 +367,12 @@ endfunc
- 
-         vmov.u8  q10, #128
-         vtbl.8   d0, {d31}, d0
-+        vadd.s8  q11, q2, q10
-         vtbl.8   d1, {d31}, d1
-+        vadd.s8  q12, q3, q10
-         vtbl.8   d2, {d31}, d2
-+        vqadd.s8 q11, q0
-         vtbl.8   d3, {d31}, d3
--
--        vadd.s8    q11, q2, q10
--        vadd.s8    q12, q3, q10
--        vqadd.s8   q11, q0
-         vqadd.s8   q12, q1
-         vsub.s8    q0, q11, q10
-         vsub.s8    q1, q12, q10
--- 
-2.5.0
-
-
-From 026bac1824e4936e948e6b1efec82868c520ea66 Mon Sep 17 00:00:00 2001
-From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-Date: Mon, 2 Feb 2015 16:08:27 +0200
-Subject: [PATCH 9/9] Further SAO NEON optimisations
-
----
- libavcodec/arm/hevcdsp_init_neon.c |  16 +--
- libavcodec/arm/hevcdsp_sao_neon.S  | 224 +++++++++++++++++++------------------
- 2 files changed, 124 insertions(+), 116 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index 6d0689c..e5da7e9 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
- 
--void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
--void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
--void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
--void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
- 
- void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
- void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-@@ -189,16 +189,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
- 
-     switch(cwidth){
-     case 8:
--        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-         break;
-     case 16:
--        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-         break;
-     case 32:
--        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-         break;
-     case 64:
--        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-         break;
-     default:
-         for (y = 0; y < height; y++) {
-diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-index 08f50b8..9c7808d 100644
---- a/libavcodec/arm/hevcdsp_sao_neon.S
-+++ b/libavcodec/arm/hevcdsp_sao_neon.S
-@@ -22,21 +22,16 @@
- #include "neon.S"
- 
- .macro init_sao_band
--        ldr      r12, [sp, #0]    // offset_table address
-         pld      [r1]
--        vld1.8   {q0, q1}, [r12]  // offset table
--        ldr      r12, [sp, #4]    // height
-+        vld1.8   {q0, q1}, [r2]  // offset table
-+        ldr       r2, [sp, #0]   // stride_dst
-+        ldr      r12, [sp, #4]   // height
-         vmov.u8  q3, #128
- .endm
- 
- // 128 in q3
- // input q8 - q11
--// 32 cycles
- .macro sao_band_64
--        vshr.u8  q12, q8, #3
--        vshr.u8  q13, q9, #3
--        vshr.u8  q14, q10, #3
--        vshr.u8  q15, q11, #3
-         vtbl.8   d24, {d0, d1, d2, d3}, d24
-         vadd.s8  q8, q3
-         vtbl.8   d25, {d0, d1, d2, d3}, d25
-@@ -52,8 +47,8 @@
-         vtbl.8   d30, {d0, d1, d2, d3}, d30
-         vqadd.s8 q10, q14
-         vtbl.8   d31, {d0, d1, d2, d3}, d31
--        vqadd.s8 q11, q15
-         vsub.s8  q8, q3
-+        vqadd.s8 q11, q15
-         vsub.s8  q9, q3
-         vsub.s8  q10, q3
-         vsub.s8  q11, q3
-@@ -64,12 +59,16 @@ function ff_hevc_sao_band_w8_neon_8, export=1
- 1:      subs     r12, #8
-         vld1.8   {d16}, [r1, :64], r3
-         vld1.8   {d17}, [r1, :64], r3
-+        vshr.u8  q12, q8, #3
-         vld1.8   {d18}, [r1, :64], r3
-         vld1.8   {d19}, [r1, :64], r3
-+        vshr.u8  q13, q9, #3
-         vld1.8   {d20}, [r1, :64], r3
-         vld1.8   {d21}, [r1, :64], r3
-+        vshr.u8  q14, q10, #3
-         vld1.8   {d22}, [r1, :64], r3
-         vld1.8   {d23}, [r1, :64], r3
-+        vshr.u8  q15, q11, #3
-         sao_band_64
-         vst1.8  {d16}, [r0, :64], r2
-         vst1.8  {d17}, [r0, :64], r2
-@@ -88,9 +87,13 @@ function ff_hevc_sao_band_w16_neon_8, export=1
-         init_sao_band
- 1:      subs     r12, #4
-         vld1.8  {q8}, [r1, :128], r3
-+        vshr.u8  q12, q8, #3
-         vld1.8  {q9}, [r1, :128], r3
-+        vshr.u8  q13, q9, #3
-         vld1.8  {q10}, [r1, :128], r3
-+        vshr.u8  q14, q10, #3
-         vld1.8  {q11}, [r1, :128], r3
-+        vshr.u8  q15, q11, #3
-         sao_band_64
-         vst1.8   {q8}, [r0, :128], r2
-         vst1.8   {q9}, [r0, :128], r2
-@@ -105,7 +108,11 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-         init_sao_band
- 1:      subs     r12, #2
-         vld1.8   {q8-q9}, [r1, :128], r3
-+        vshr.u8  q12, q8, #3
-+        vshr.u8  q13, q9, #3
-         vld1.8   {q10-q11}, [r1, :128], r3
-+        vshr.u8  q14, q10, #3
-+        vshr.u8  q15, q11, #3
-         sao_band_64
-         vst1.8   {q8-q9}, [r0, :128], r2
-         vst1.8   {q10-q11}, [r0, :128], r2
-@@ -119,7 +126,11 @@ function ff_hevc_sao_band_w64_neon_8, export=1
- 1:      subs      r12, #1
-         pld       [r1, r3]
-         vld1.8    {q8-q9}, [r1, :128]!
-+        vshr.u8  q12, q8, #3
-+        vshr.u8  q13, q9, #3
-         vld1.8    {q10-q11}, [r1, :128], r3
-+        vshr.u8  q14, q10, #3
-+        vshr.u8  q15, q11, #3
-         sub       r1, #32
-         sao_band_64
-         vst1.8    {q8-q9}, [r0, :128]!
-@@ -129,51 +140,18 @@ function ff_hevc_sao_band_w64_neon_8, export=1
- 
-         bx lr
- endfunc
--// input
--// a in q0 - q3
--// c in q4 - q7
--// b in q8 - q11
--// offset table in r7 and r5
--// output in q0 - q3
--// clobbers q12 - q15
--.macro edge_w64_body
--        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
--        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
--        vcgt.u8 q13,  q5, q1
--        vcgt.u8  q1,  q1, q5
--        vsub.s8 q12,  q0, q12 // diff0
--        vcgt.u8  q0,  q4, q8 // c > b
--        vsub.s8 q13,  q1, q13
--
--        vcgt.u8 q14,  q8, q4 // b > c
--        vcgt.u8  q1,  q5, q9
--        vcgt.u8 q15,  q9, q5
--        vsub.s8  q0, q14, q0 // diff1
--        vsub.s8  q1, q15, q1
- 
--        vadd.s8  q0, q12 //diff0 + diff1
--        vadd.s8  q1, q13
--
--        vcgt.u8 q14,  q6, q2
--        vcgt.u8  q2,  q2, q6
--        vcgt.u8 q15,  q7, q3
--        vcgt.u8  q3,  q3, q7
--
--        vsub.s8 q14,  q2, q14
--        vcgt.u8  q2,  q6, q10
--        vsub.s8 q15,  q3, q15
--
--        vcgt.u8 q12, q10, q6
--        vcgt.u8  q3,  q7, q11
--        vcgt.u8 q13, q11, q7
--        vsub.s8  q2, q12, q2
--        vsub.s8  q3, q13, q3
-+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
-+        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
-+        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
-+        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
-+        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
-+        vsub.s8 \out0, \tmp0, \out0 // diff0
-+        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
-+.endm
- 
-+.macro table64
-         vmov.s8 q13, #2 // 2 to all elements
--
--        vadd.s8  q2, q14
--        vadd.s8  q3, q15
--
-         vmov.32  d24[0], r4  // load offset table from general registers
-         vmov.32  d24[1], r5  // load rest of offset table
- 
-@@ -208,6 +186,28 @@ endfunc
-         sub     r0, #32
- .endm
- 
-+// input
-+// a in q0 - q3
-+// c in q4 - q7
-+// b in q8 - q11
-+// offset table in r7 and r5
-+// output in q0 - q3
-+// clobbers q12 - q15
-+.macro edge_w64_body
-+        diff32 q12, q13, q0, q1, q0, q1, q4, q5
-+        diff32 q0, q1, q14, q15, q8, q9, q4, q5
-+
-+        vadd.s8  q0, q12 //diff0 + diff1
-+        vadd.s8  q1, q13
-+
-+        diff32  q14, q15, q2, q3, q2, q3, q6, q7
-+        diff32  q2, q3, q12, q13, q10, q11, q6, q7
-+
-+        vadd.s8  q2, q14
-+        vadd.s8  q3, q15
-+        table64
-+.endm
-+
- .macro init_edge_64
-         push   {r4-r5}
-         ldr    r12, [sp, #8] // height
-@@ -334,38 +334,23 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-         bx lr
- endfunc
- 
--// inputs:
--// a in q0, q1
--// c in q2, q3
--// b in q8, q9
--// offset table in d31
--// clobbered registers q0, q1, q10, q11, q12, q13
--// output q0, q1
--.macro edge_w32_body
--        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
--        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
--        vcgt.u8 q13, q3, q1
--        vcgt.u8 q1,  q1, q3
--
--        vsub.s8 q12, q0, q12 // diff0
--        vcgt.u8  q0,  q2, q8 // c > b
--        vsub.s8 q13, q1, q13 // diff0 part 2
--
--        vcgt.u8  q10,  q8, q2 // b > c
--        vcgt.u8  q1,  q3, q9
--        vcgt.u8  q11,  q9, q3
--
--        vsub.s8 q0, q10, q0 // diff1
--
--        vmov.s8 q10, #2 // 2 to all elements
--        vsub.s8 q1, q11, q1 // diff1 part 2
--        vadd.s8 q0, q12 //diff0 + diff1
--        vadd.s8 q1, q13
-+.macro init_edge_32
-+        ldr     r12, [sp, #4] // sao_offset_val_table
-+        vld1.32 {d31}, [r12]
-+        ldr     r12, [sp] // height
-+.endm
- 
--        vadd.s8 q0, q10
--        vadd.s8 q1, q10
-+.macro diff out0, tmp0, in0, in1
-+        vcgt.u8 \out0, \in1, \in0  // c > a -> -1 , otherwise 0
-+        vcgt.u8 \tmp0,  \in0, \in1  // a > c -> -1 , otherwise 0
-+        vsub.s8 \out0, \tmp0, \out0 // diff0
-+.endm
- 
--        vmov.u8  q10, #128
-+.macro table32
-+        vmov.s8  q10, #2
-+        vadd.s8  q0, q10
-+        vadd.s8  q1, q10
-+        vmov.s8  q10, #128
-         vtbl.8   d0, {d31}, d0
-         vadd.s8  q11, q2, q10
-         vtbl.8   d1, {d31}, d1
-@@ -373,56 +358,68 @@ endfunc
-         vtbl.8   d2, {d31}, d2
-         vqadd.s8 q11, q0
-         vtbl.8   d3, {d31}, d3
--        vqadd.s8   q12, q1
--        vsub.s8    q0, q11, q10
--        vsub.s8    q1, q12, q10
-+        vqadd.s8 q12, q1
-+        vsub.s8  q0, q11, q10
-+        vsub.s8  q1, q12, q10
-         vst1.8   {q0-q1}, [r0, :128], r2
- .endm
- 
--.macro init_edge_32
--        ldr     r12, [sp, #4] // sao_offset_val_table
--        vld1.32 {d31}, [r12]
--        ldr     r12, [sp] // height
--.endm
--
- function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-         init_edge_32
--        sub     r1, #4 // load 4 extra bytes
-+        vpush {q4-q7}
-+        sub     r1, #4
- 1:      subs    r12, #1
--        vld1.32 d3[1], [r1]!
--        vld1.8  {q2-q3}, [r1, :128]! // c
--        vld1.32 d20[0], [r1], r3
--        sub     r1, #36
-+        vld1.8  {q13-q14}, [r1]!
-+        vld1.32 d30, [r1], r3
-+        sub     r1, #32
-         // a
--        vext.8  q0, q1, q2, #15
--        vext.8  q1, q2, q3, #15
--        // b
--        vext.8  q8, q2, q3, #1
--        vext.8  q9, q3, q10, #1
--        edge_w32_body
-+        vext.8   q0, q13, q14, #3
-+        vext.8   q1, q14, q15, #3
-+        vshr.u64 d24, d30, #24
-+        // c
-+        vext.8   q2, q13, q14, #4
-+        vext.8   q3, q14, q15, #4
-+        vshr.u64 d16, d30, #32
-+        // diff0
-+        diff32 q13, q14, q4, q5, q0, q1, q2, q3
-+        diff   d18, d25, d24, d16
-+        // -diff1
-+        vext.s8 q0, q13, q14, #1
-+        vext.s8 q1, q14, q9, #1
-+
-+        vsub.s8 q0, q13, q0 //diff0 + diff1
-+        vsub.s8 q1, q14, q1
-+        table32
-         bne     1b
-+        vpop {q4-q7}
-+
-         bx      lr
- endfunc
- 
- function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-         init_edge_32
-+        vpush {q4-q7}
-         // load a
-         sub     r1, r3
-         vld1.8  {q0-q1}, [r1, :128], r3
-         // load c
-         vld1.8  {q2-q3}, [r1, :128], r3
-+        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
- 1:      subs    r12, #1
-         // load b
-         vld1.8  {q8-q9}, [r1, :128], r3
--        edge_w32_body
--        // inputs for next loop iteration
--        // a
--        vmov.64 q0, q2
--        vmov.64 q1, q3
-+        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
-+        vadd.s8 q0, q4, q12 //diff0 + diff1
-+        vadd.s8 q1, q5, q13
-+        table32
-+        // CMP ( c, a )
-+        vneg.s8 q12, q4
-+        vneg.s8 q13, q5
-         // c
-         vmov.64 q2, q8
-         vmov.64 q3, q9
-         bne     1b
-+        vpop {q4-q7}
-         bx      lr
- endfunc
- 
-@@ -452,7 +449,11 @@ function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-         vext.8  q6, q10, q11, #8
-         vext.8  q7, q11, q12, #8
-         vext.8  q5, q10, q11, #7
--        edge_w32_body
-+        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-+        vadd.s8 q0, q12 //diff0 + diff1
-+        vadd.s8 q1, q13
-+        table32
-         // inputs for next loop iteration
-         // a
-         vmov.8  q0, q4
-@@ -487,7 +488,14 @@ function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-         vext.8  q8, q10, q11, #7
-         vext.8  q9, q11, q12, #7
-         vext.8  q14, q12, q10, #7
--        edge_w32_body
-+
-+        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-+
-+        vadd.s8 q0, q12 //diff0 + diff1
-+        vadd.s8 q1, q13
-+        table32
-+
-         // inputs for next loop iteration
-         // a
-         vext.8  q0, q2, q3, #1
--- 
-2.5.0
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
index 3634d4316f..2e052294f2 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
@@ -1,14 +1,17 @@
-From b9b5434c61afd492a54dad5158b4d56ecbf7f01d Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 28 Apr 2015 16:18:40 +0100
-Subject: [PATCH 01/68] Added display output
-
----
- ffmpeg.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 159 insertions(+)
-
+diff --git a/.gitignore b/.gitignore
+index 524fb73..305632b 100644
+--- a/.gitignore
++++ b/.gitignore
+@@ -23,6 +23,7 @@
+ .\#*
+ /.config
+ /.version
++/build/
+ /ffmpeg
+ /ffplay
+ /ffprobe
 diff --git a/ffmpeg.c b/ffmpeg.c
-index 9ffd833..50c6e86 100644
+index 9ffd833..7a86d7e 100644
 --- a/ffmpeg.c
 +++ b/ffmpeg.c
 @@ -23,6 +23,11 @@
@@ -17,17 +20,20 @@ index 9ffd833..50c6e86 100644
  
 +#ifdef RPI
 +#define RPI_DISPLAY
-+//#define RPI_ZERO_COPY
++#define RPI_ZERO_COPY
 +#endif
 +
  #include "config.h"
  #include <ctype.h>
  #include <string.h>
-@@ -66,6 +71,20 @@
+@@ -66,6 +71,25 @@
  # include "libavfilter/buffersrc.h"
  # include "libavfilter/buffersink.h"
  
 +#ifdef RPI_DISPLAY
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
 +#include <bcm_host.h>
 +#include <interface/mmal/mmal.h>
 +#include <interface/mmal/mmal_parameters_camera.h>
@@ -36,15 +42,17 @@ index 9ffd833..50c6e86 100644
 +#include <interface/mmal/util/mmal_default_components.h>
 +#include <interface/mmal/util/mmal_connection.h>
 +#include <interface/mmal/util/mmal_util_params.h>
++#pragma GCC diagnostic pop
 +#ifdef RPI_ZERO_COPY
 +#include "libavcodec/rpi_qpu.h"
 +#endif
++#include "libavcodec/rpi_zc.h"
 +#endif
 +
  #if HAVE_SYS_RESOURCE_H
  #include <sys/time.h>
  #include <sys/types.h>
-@@ -158,6 +177,134 @@ static int restore_tty;
+@@ -158,6 +182,169 @@ static int restore_tty;
  static void free_input_threads(void);
  #endif
  
@@ -54,13 +62,7 @@ index 9ffd833..50c6e86 100644
 +
 +static MMAL_COMPONENT_T* rpi_display = NULL;
 +static MMAL_POOL_T *rpi_pool = NULL;
-+
-+#ifdef RPI_ZERO_COPY
-+static uint8_t *get_vc_handle(AVBufferRef *bref) {
-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+  return (uint8_t *)p->vc_handle;
-+}
-+#endif
++static volatile int rpi_display_count = 0;
 +
 +static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
 +{
@@ -77,7 +79,7 @@ index 9ffd833..50c6e86 100644
 +    for (i = 0; i < NUM_BUFFERS; ++i)
 +    {
 +       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
-+       void* bufPtr = buffer->data;
++       char * bufPtr = buffer->data;
 +       memset(bufPtr, i*30, w*h);
 +       memset(bufPtr+w*h, 128, (w*h)/2);
 +    }
@@ -86,81 +88,122 @@ index 9ffd833..50c6e86 100644
 +    return pool;
 +}
 +
-+static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
++#ifdef RPI_ZERO_COPY
++    av_rpi_zc_unref(buffer->user_data);
++    __sync_fetch_and_sub(&rpi_display_count, 1); // atomic: display_frame() polls and increments this counter
++#endif
++    mmal_buffer_header_release(buffer);
++}
++
++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
 +  mmal_buffer_header_release(buffer);
 +}
 +
 +static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
 +{
 +    MMAL_COMPONENT_T* display;
-+    int w2 = (w+31)&~31;
-+    int h2 = (h+15)&~15;
 +    MMAL_DISPLAYREGION_T region =
 +    {
-+        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
++        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
 +        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
 +        .layer = 2,
 +        .fullscreen = 0,
 +        .dest_rect = {x, y, w, h}
 +    };
++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
++
 +    bcm_host_init();  // TODO is this needed?
 +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
 +    assert(display);
 +
 +    mmal_port_parameter_set(display->input[0], &region.hdr);
 +
-+    MMAL_ES_FORMAT_T* format = display->input[0]->format;
-+    format->encoding = MMAL_ENCODING_I420;
-+    format->es->video.width = w2;
-+    format->es->video.height = h2;
-+    format->es->video.crop.x = 0;
-+    format->es->video.crop.y = 0;
-+    format->es->video.crop.width = w;
-+    format->es->video.crop.height = h;
-+    mmal_port_format_commit(display->input[0]);
++    {
++        MMAL_ES_FORMAT_T* format = display->input[0]->format;
++        format->encoding = MMAL_ENCODING_I420;
++        format->es->video.width = geo.stride_y;
++        format->es->video.height = geo.height_y;
++        format->es->video.crop.x = 0;
++        format->es->video.crop.y = 0;
++        format->es->video.crop.width = w;
++        format->es->video.crop.height = h;
++        mmal_port_format_commit(display->input[0]);
++    }
 +
 +    mmal_component_enable(display);
 +
-+    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
++    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
 +
 +    mmal_port_enable(display->input[0],display_cb_input);
-+    mmal_port_enable(display->control,display_cb_input);
++    mmal_port_enable(display->control,display_cb_control);
 +
-+    printf("Allocated display %d %d\n",w,h);
++    printf("Allocated display %zux%zu in %dx%d\n", w, h, geo.stride_y, geo.height_y); // w/h are size_t
 +
 +    return display;
 +}
 +
-+static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
++static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
 +{
++    MMAL_BUFFER_HEADER_T* buf;
++
++    if (!display || !rpi_pool)
++        return;
++
++    if (rpi_display_count >= 3) {
++        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
++        return;
++    }
++
++    buf = mmal_queue_get(rpi_pool->queue);
++    if (!buf) {
++        // Running too fast so drop the frame
++        printf("Q alloc failure\n");
++        return;
++    }
++    assert(buf);
++    buf->cmd = 0;
++    buf->offset = 0; // Offset to valid data
++    buf->flags = 0;
++#ifdef RPI_ZERO_COPY
++{
++    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
++
++    buf->user_data = fr_buf;
++    buf->data = av_rpi_zc_vc_handle(fr_buf);
++    buf->alloc_size =
++        buf->length = av_rpi_zc_numbytes(fr_buf);
++
++    __sync_fetch_and_add(&rpi_display_count, 1); // atomic: pairs with the decrement in display_cb_input()
++}
++#else
++{
++#error YYY
 +    int w = fr->width;
 +    int h = fr->height;
 +    int w2 = (w+31)&~31;
 +    int h2 = (h+15)&~15;
-+    if (!display || !rpi_pool)
-+        return;
-+    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
-+    if (!buf) {
-+      // Running too fast so drop the frame
-+      return;
-+    }
-+    assert(buf);
-+    buf->cmd = 0;
++
 +    buf->length = (w2 * h2 * 3)/2;
-+    buf->offset = 0; // Offset to valid data
-+    buf->flags = 0;
-+#ifdef RPI_ZERO_COPY
-+    buf->data = get_vc_handle(fr->buf[0]);
-+    buf->alloc_size = (w2*h2*3)/2;
-+#else
++    buf->user_data = NULL;
++
 +    //mmal_buffer_header_mem_lock(buf);
 +    memcpy(buf->data, fr->data[0], w2 * h);
 +    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
 +    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
 +    //mmal_buffer_header_mem_unlock(buf);
++}
 +#endif
 +
-+    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
++    while (rpi_display_count >= 3) {
++        usleep(5000);
++    }
++
++    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
++    {
++        printf("** send failed: depth=%d\n", rpi_display_count);
++        display_cb_input(NULL, buf);
++    }
 +}
 +
 +static void display_exit(MMAL_COMPONENT_T* display)
@@ -179,52 +222,2577 @@ index 9ffd833..50c6e86 100644
  /* sub2video hack:
     Convert subtitles to video with alpha to insert them in filter graphs.
     This is a temporary solution until libavfilter gets real subtitles support.
-@@ -581,6 +728,10 @@ static void ffmpeg_cleanup(int ret)
+@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
+         avformat_close_input(&input_files[i]->ctx);
+         av_freep(&input_files[i]);
      }
-     term_exit();
-     ffmpeg_exited = 1;
 +
 +#ifdef RPI_DISPLAY
 +    display_exit(rpi_display);
 +#endif
++
+     for (i = 0; i < nb_input_streams; i++) {
+         InputStream *ist = input_streams[i];
+ 
+@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
+         av_freep(&ist->filters);
+         av_freep(&ist->hwaccel_device);
+ 
++#ifdef RPI_ZERO_COPY
++        av_rpi_zc_uninit(ist->dec_ctx);
++#endif
+         avcodec_free_context(&ist->dec_ctx);
+ 
+         av_freep(&input_streams[i]);
+@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
+     }
+     term_exit();
+     ffmpeg_exited = 1;
++
  }
  
  void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -940,6 +1091,14 @@ static void do_video_out(AVFormatContext *s,
-     int frame_size = 0;
-     InputStream *ist = NULL;
-     AVFilterContext *filter = ost->filter->filter;
-+#ifdef RPI_DISPLAY
-+    if (next_picture)
-+    {
-+	if (!rpi_display)
-+           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
-+        display_frame(rpi_display,next_picture);
-+    }
-+#endif
- 
+@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
      if (ost->source_index >= 0)
          ist = input_streams[ost->source_index];
--- 
-2.7.4
-
-
-From b90a5aff7bf9112ebd2a07949c8d79a49fcafe48 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 29 Apr 2015 16:49:43 +0100
-Subject: [PATCH 02/68] Split transform and intra prediction into commands
-
----
- libavcodec/hevc.c       | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
- libavcodec/hevc.h       |  58 +++++++++++++++++++++++
- libavcodec/hevc_cabac.c |  15 ++++++
- 3 files changed, 191 insertions(+), 1 deletion(-)
-
+ 
++#ifdef RPI_DISPLAY
++    if (next_picture && ist != NULL)
++    {
++        if (!rpi_display)
++           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
++        display_frame(ist->dec_ctx, rpi_display, next_picture);
++    }
++#endif
++
+     if (filter->inputs[0]->frame_rate.num > 0 &&
+         filter->inputs[0]->frame_rate.den > 0)
+         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
+@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+         ist->dec_ctx->opaque                = ist;
+         ist->dec_ctx->get_format            = get_format;
+         ist->dec_ctx->get_buffer2           = get_buffer;
++
++#ifdef RPI_ZERO_COPY
++        // Overrides the above get_buffer2
++        av_rpi_zc_init(ist->dec_ctx);
++#endif
++
+         ist->dec_ctx->thread_safe_callbacks = 1;
+ 
+         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index fd0d1f0..40d22d2 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -5,6 +5,11 @@ NAME = avcodec
+ HEADERS = avcodec.h                                                     \
+           avdct.h                                                       \
+           avfft.h                                                       \
++          rpi_qpu.h                                                     \
++          rpi_shader.h                                                  \
++          rpi_mailbox.h                                                 \
++          rpi_hevc_transform.h                                          \
++          rpi_zc.h                                                      \
+           d3d11va.h                                                     \
+           dirac.h                                                       \
+           dv_profile.h                                                  \
+@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
+        resample.o                                                       \
+        resample2.o                                                      \
+        utils.o                                                          \
++       rpi_qpu.o                                                        \
++       rpi_shader.o                                                     \
++       rpi_mailbox.o                                                    \
++       rpi_zc.o                                                         \
+        vorbis_parser.o                                                  \
+        xiph.o                                                           \
+ 
+@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
+ endif
++
++$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
++	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
++
++$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
++	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
++
++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
+diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+index 54efaad..02a89c3 100644
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -667,6 +667,7 @@ void avcodec_register_all(void)
+     REGISTER_PARSER(H261,               h261);
+     REGISTER_PARSER(H263,               h263);
+     REGISTER_PARSER(H264,               h264);
++    REGISTER_PARSER(H264_MVC,           h264_mvc);
+     REGISTER_PARSER(HEVC,               hevc);
+     REGISTER_PARSER(MJPEG,              mjpeg);
+     REGISTER_PARSER(MLP,                mlp);
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index a4ceca7..1354c14 100644
+--- a/libavcodec/arm/Makefile
++++ b/libavcodec/arm/Makefile
+@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+                                           arm/hevcdsp_deblock_neon.o    \
++                                          arm/hevcdsp_epel_neon.o       \
+                                           arm/hevcdsp_idct_neon.o       \
+-                                          arm/hevcdsp_qpel_neon.o
++                                          arm/hevcdsp_qpel_neon.o       \
++                                          arm/hevcdsp_sao_neon.o
+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
+ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
+                                           arm/rv40dsp_neon.o
+diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
+index fdbf86b..0a3980a 100644
+--- a/libavcodec/arm/cabac.h
++++ b/libavcodec/arm/cabac.h
+@@ -26,13 +26,34 @@
+ #include "libavutil/internal.h"
+ #include "libavcodec/cabac.h"
+ 
++
++#if UNCHECKED_BITSTREAM_READER
++#define LOAD_16BITS_BEHI\
++        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
++        "rev        %[tmp]        , %[tmp]                      \n\t"
++#elif CONFIG_THUMB
++#define LOAD_16BITS_BEHI\
++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
++        "it         cs                                          \n\t"\
++        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
++        "rev        %[tmp]        , %[tmp]                      \n\t"
++#else
++#define LOAD_16BITS_BEHI\
++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
++        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
++        "rev        %[tmp]        , %[tmp]                      \n\t"
++#endif
++
++
+ #define get_cabac_inline get_cabac_inline_arm
+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
+                                                  uint8_t *const state)
+ {
+     int bit;
++#if 0
+     void *reg_b, *reg_c, *tmp;
+-
+     __asm__ volatile(
+         "ldrb       %[bit]        , [%[state]]                  \n\t"
+         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
+@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
+           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
+         : "memory", "cc"
+         );
++#else
++   // *** Not thumb compatible yet
++   unsigned int reg_b, tmp;
++    __asm__ (
++        "ldrb       %[bit]        , [%[state]]                  \n\t"
++        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
++        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
++        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
++        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
++// %bit = *state
++// %range = range
++// %tmp = RangeLPS
++        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
++
++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
++        "ittt       ge                                          \n\t"
++        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
++        "mvnge      %[bit]        , %[bit]                      \n\t"
++        "movge      %[range]      , %[tmp]                      \n\t"
++
++        "clz        %[tmp]        , %[range]                    \n\t"
++        "sub        %[tmp]        , #23                         \n\t"
++
++        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
++        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
++        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
++
++        "strb       %[r_b]        , [%[state]]                  \n\t"
++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
++
++        "bne        2f                                          \n\t"
++        LOAD_16BITS_BEHI
++        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
++        "movw       %[r_b]        , #0xFFFF                     \n\t"
++        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++
++        "rbit       %[r_b]        , %[low]                      \n\t"
++        "clz        %[r_b]        , %[r_b]                      \n\t"
++        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
++#if CONFIG_THUMB
++        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++        "add        %[low]        , %[low]      , %[tmp]        \n\t"
++#else
++        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
++#endif
++        "2:                                                     \n\t"
++        :    [bit]"=&r"(bit),
++             [low]"+&r"(c->low),
++           [range]"+&r"(c->range),
++             [r_b]"=&r"(reg_b),
++             [ptr]"+&r"(c->bytestream),
++             [tmp]"=&r"(tmp)
++          :  [state]"r"(state),
++            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++              [byte]"M"(offsetof(CABACContext, bytestream)),
++#if !UNCHECKED_BITSTREAM_READER
++                 [c]"r"(c),
++               [end]"M"(offsetof(CABACContext, bytestream_end)),
++#endif
++           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++        : "memory", "cc"
++        );
++#endif
+ 
+     return bit & 1;
+ }
++
++#define get_cabac_bypass get_cabac_bypass_arm
++static inline int get_cabac_bypass_arm(CABACContext * const c)
++{   // Decodes one CABAC bypass bin from c and returns it (0 or 1); updates c->low and c->bytestream.
++    int rv = 0;  // the asm adds the carry of its compare to this, so it ends up 0 or 1
++    unsigned int tmp;
++    __asm (
++        "lsl        %[low]        , #1                          \n\t"  // low <<= 1
++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"  // carry set when low >= (range << 17), unsigned
++        "adc        %[rv]         , %[rv]       , #0            \n\t"  // rv += carry
++        "it         cs                                          \n\t"
++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"  // on carry: low -= range << 17
++        "lsls       %[tmp]        , %[low]      , #16           \n\t"  // Z set when the low 16 bits of low are all zero
++        "bne        1f                                          \n\t"  // low 16 bits non-zero: skip the refill
++        LOAD_16BITS_BEHI  // defined outside this block; presumably loads the next 16 stream bits into the top half of tmp - confirm
++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"  // low += tmp >> 15
++        "movw       %[tmp]        , #0xFFFF                     \n\t"
++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"  // low -= 0xFFFF
++        "1:                                                     \n\t"
++        : // Outputs
++              [rv]"+&r"(rv),
++             [low]"+&r"(c->low),
++             [tmp]"=&r"(tmp),
++             [ptr]"+&r"(c->bytestream)
++        : // Inputs
++#if !UNCHECKED_BITSTREAM_READER
++                 [c]"r"(c),
++               [end]"M"(offsetof(CABACContext, bytestream_end)),
++#endif
++             [range]"r"(c->range)
++        : "cc"  // NOTE(review): no "memory" clobber although %[ptr] is presumably dereferenced by the refill macro - confirm intended
++    );
++    return rv;
++}
++
++
++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
++{   // Decodes one bypass bin and returns rv unchanged or negated depending on it; updates c->low and c->bytestream.
++    unsigned int tmp;
++    __asm (
++        "lsl        %[low]        , #1                          \n\t"  // low <<= 1
++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"  // carry clear when low < (range << 17), unsigned
++        "ite        cc                                          \n\t"
++        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"  // carry clear: rv = -rv
++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"  // carry set: low -= range << 17, rv kept
++        "lsls       %[tmp]        , %[low]      , #16           \n\t"  // Z set when the low 16 bits of low are all zero
++        "bne        1f                                          \n\t"  // low 16 bits non-zero: skip the refill
++        LOAD_16BITS_BEHI  // defined outside this block; presumably loads the next 16 stream bits into the top half of tmp - confirm
++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"  // low += tmp >> 15
++        "movw       %[tmp]        , #0xFFFF                     \n\t"
++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"  // low -= 0xFFFF
++        "1:                                                     \n\t"
++        : // Outputs
++              [rv]"+&r"(rv),
++             [low]"+&r"(c->low),
++             [tmp]"=&r"(tmp),
++             [ptr]"+&r"(c->bytestream)
++        : // Inputs
++#if !UNCHECKED_BITSTREAM_READER
++                 [c]"r"(c),
++               [end]"M"(offsetof(CABACContext, bytestream_end)),
++#endif
++             [range]"r"(c->range)
++        : "cc"  // NOTE(review): no "memory" clobber although %[ptr] is presumably dereferenced by the refill macro - confirm intended
++    );
++    return rv;
++}
++
+ #endif /* HAVE_ARMV6T2_INLINE */
+ 
+ #endif /* AVCODEC_ARM_CABAC_H */
+diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
+new file mode 100644
+index 0000000..31d3c59
+--- /dev/null
++++ b/libavcodec/arm/hevc_cabac.h
+@@ -0,0 +1,491 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVC_CABAC_H
++#define AVCODEC_ARM_HEVC_CABAC_H
++
++#include "config.h"
++#if HAVE_ARMV6T2_INLINE
++
++#define hevc_mem_bits32 hevc_mem_bits32_arm
++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
++{   // Returns the 32-bit word at byte offset (bits >> 3) from p, byte-reversed and then shifted left by (bits & 7).
++    unsigned int n;
++    __asm__ (
++        "rev        %[n], %[x]                     \n\t"  // n = byte-reversed copy of the loaded word
++        : [n]"=r"(n)
++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))  // 32-bit load at a byte-granular address
++        :
++        );
++    return n << (bits & 7);  // the low (bits & 7) bits of the result are zero
++}
++
++
++// ---------------------------------------------------------------------------
++//
++// Helper fns - little bits of code where ARM has an instruction that the
++// compiler doesn't know about / use
++
++#define trans_scale_sat trans_scale_sat_arm
++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{   // Returns (((level * scale * scale_m) >> shift) + 1) >> 1, saturated to the signed 16-bit range.
++    int rv;
++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;  // the +1 rounds the final halving done in the asm
++
++    __asm__ (
++    "ssat %[rv], #16, %[t], ASR #1 \n\t"  // rv = (t >> 1) saturated to 16 signed bits
++    : [rv]"=r"(rv)
++    : [t]"r"(t)
++    :
++    );
++    return rv;
++}
++
++#define update_rice update_rice_arm
++static inline void update_rice_arm(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)
++{   // Updates *stat_coeff in place: -1 when t == 0, +1 when t >= 6, clamped to 0..255 (t is computed below).
++    int t;
++    __asm__ (
++    "lsl   %[t], %[coeff], #1               \n\t"  // t = coeff << 1
++    "lsrs  %[t], %[t], %[shift]             \n\t"  // t >>= shift; Z set when t == 0
++    "it    eq                               \n\t"
++    "subeq %[stat], %[stat], #1             \n\t"  // t == 0: stat -= 1 (may go below zero, fixed by usat)
++    "cmp   %[t], #6                         \n\t"  // carry set when t >= 6
++    "adc   %[stat], %[stat], #0             \n\t"  // stat += carry
++    "usat  %[stat], #8, %[stat]             \n\t"  // clamp stat to 0..255
++    : [stat]"+&r"(*stat_coeff),
++         [t]"=&r"(t)
++    :  [coeff]"r"(last_coeff_abs_level_remaining),
++       [shift]"r"(c_rice_param)
++    : "cc"
++    );
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC get loops
++//
++// Where the loop is simple enough we can normally do 10-30% better than the
++// compiler
++
++// Get the residual greater than 1 bits
++
++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
++    uint8_t * const state0)
++{   // Decodes n context-coded bins and returns them packed into one word, first decoded bin in the highest used bit.
++    unsigned int i, reg_b, st, tmp, bit, rv;  // n is compared only after a bin has been decoded, so n == 0 is not handled
++     __asm__ (
++         "mov        %[i]          , #0                          \n\t"
++         "mov        %[rv]         , #0                          \n\t"
++         "1:                                                     \n\t"
++         "add        %[i]          , %[i]        , #1            \n\t"  // i counts decoded bins, starting at 1
++         "cmp        %[rv]         , #0                          \n\t"
++         "ite        eq                                          \n\t"
++         "usateq     %[st]         , #2          , %[i]          \n\t"  // no 1 bit seen yet: st = min(i, 3)
++         "movne      %[st]         , #0                          \n\t"  // a 1 bit was seen: st = 0
++
++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"  // bit = state0[st]
++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"  // r_b = mlps_tables - lps_off (presumably the LPS range table - confirm)
++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"  // bits 6..7 of range
++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"  // tmp = r_b[(range & 0xC0) << 1], with r_b already offset by state
++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "cmp        %[low]        , %[range], lsl #17           \n\t"
++         "ittt       ge                                          \n\t"  // when low >= range << 17 (signed compare):
++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"  //   low -= range << 17
++         "mvnge      %[bit]        , %[bit]                      \n\t"  //   bit = ~bit
++         "movge      %[range]      , %[tmp]                      \n\t"  //   range = tmp
++
++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"  // r_b = next state, stored back below
++         "and        %[bit]        , %[bit]      , #1            \n\t"
++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"  // rv = (rv << 1) | decoded bin
++
++         "clz        %[tmp]        , %[range]                    \n\t"
++         "sub        %[tmp]        , #23                         \n\t"  // tmp = clz(range) - 23
++
++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"  // shift low and range left by the same amount
++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"  // state0[st] = next state
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++         "lsls       %[tmp]        , %[low]      , #16           \n\t"  // Z set when the low 16 bits of low are zero (reload needed)
++         "it         ne                                          \n\t"
++         "cmpne      %[n]          , %[i]                        \n\t"
++         "bne        1b                                          \n\t"  // loop while no reload is needed and i != n
++
++// If reload is not required then we must have run out of flags to decode
++         "tst        %[tmp]        , %[tmp]                      \n\t"
++         "bne        2f                                          \n\t"
++
++// Do reload
++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"  // read 16 bits, bptr += 2; no end-of-buffer check here
++         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"  // tmp = (tmp >> 15) - 0xFFFF
++
++         "rbit       %[r_b]        , %[low]                      \n\t"
++         "clz        %[r_b]        , %[r_b]                      \n\t"  // r_b = number of trailing zero bits in low
++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
++
++#if CONFIG_THUMB
++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
++#else
++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"  // low += tmp << r_b
++#endif
++
++         "cmp        %[n]          , %[i]                        \n\t"
++         "bne        1b                                          \n\t"  // after the reload: continue while i != n
++         "2:                                                     \n\t"
++         :    [bit]"=&r"(bit),
++              [low]"+&r"(c->low),
++            [range]"+&r"(c->range),
++              [r_b]"=&r"(reg_b),
++             [bptr]"+&r"(c->bytestream),
++                [i]"=&r"(i),
++              [tmp]"=&r"(tmp),
++               [st]"=&r"(st),
++               [rv]"=&r"(rv)
++          :  [state0]"r"(state0),
++                  [n]"r"(n),
++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++               [byte]"M"(offsetof(CABACContext, bytestream)),  // not referenced by the template above
++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++         : "memory", "cc"
++    );
++    return rv;
++}
++
++
++// n must be > 0 on entry
++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * const ctx_map,
++    uint8_t * p)
++{   // For k = n down to 1: decodes a bin using state0[ctx_map[k]] and, when it is 1, appends byte k at p. Returns the advanced p.
++    unsigned int reg_b, tmp, st, bit;
++     __asm__ (
++         "1:                                                     \n\t"
++// Get bin from map
++         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"  // st = ctx_map[n]
++
++// Load state & ranges
++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"  // r_b = mlps_tables - lps_off (presumably the LPS range table - confirm)
++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"  // bit = state0[st]
++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"  // bits 6..7 of range
++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"  // tmp = table[((range & 0xC0) << 1) + state]
++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "cmp        %[low]        , %[range], lsl #17           \n\t"
++         "ittt       ge                                          \n\t"  // when low >= range << 17 (signed compare):
++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"  //   low -= range << 17
++         "mvnge      %[bit]        , %[bit]                      \n\t"  //   bit = ~bit
++         "movge      %[range]      , %[tmp]                      \n\t"  //   range = tmp
++
++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"  // r_b = next state, stored back below
++         "tst        %[bit]        , #1                          \n\t"  // decoded bin is bit 0
++// GCC asm seems to need strbne written differently for thumb and arm
++#if CONFIG_THUMB
++         "it         ne                                          \n\t"
++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"  // bin == 1: *idx++ = n
++#else
++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"  // bin == 1: *idx++ = n
++#endif
++
++// Renorm
++         "clz        %[tmp]        , %[range]                    \n\t"
++         "sub        %[tmp]        , #23                         \n\t"  // tmp = clz(range) - 23
++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"  // state0[st] = next state
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++         "subs       %[n]          , %[n]        , #1            \n\t"  // n--; Z set when no bins are left
++#if CONFIG_THUMB
++         "itt        ne                                          \n\t"
++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
++         "bne        1b                                          \n\t"  // loop while bins remain and no reload is needed
++#else
++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
++         "bne        1b                                          \n\t"  // loop while bins remain and no reload is needed
++#endif
++
++// If we have bits left then n must be 0 so give up now
++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
++         "bne        2f                                          \n\t"
++
++// Do reload
++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"  // read 16 bits, bptr += 2; no end-of-buffer check here
++         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"  // tmp = (tmp >> 15) - 0xFFFF
++
++         "rbit       %[r_b]        , %[low]                      \n\t"
++         "clz        %[r_b]        , %[r_b]                      \n\t"  // r_b = number of trailing zero bits in low
++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
++
++#if CONFIG_THUMB
++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
++#else
++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"  // low += tmp << r_b
++#endif
++
++// Check to see if we still have more to do
++         "cmp        %[n]          , #0                          \n\t"
++         "bne        1b                                          \n\t"
++         "2:                                                     \n\t"
++         :    [bit]"=&r"(bit),
++              [low]"+&r"(c->low),
++            [range]"+&r"(c->range),
++              [r_b]"=&r"(reg_b),
++             [bptr]"+&r"(c->bytestream),
++              [idx]"+&r"(p),
++                [n]"+&r"(n),
++              [tmp]"=&r"(tmp),
++               [st]"=&r"(st)
++          :  [state0]"r"(state0),
++            [ctx_map]"r"(ctx_map),
++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++               [byte]"M"(offsetof(CABACContext, bytestream)),  // not referenced by the template above
++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++         : "memory", "cc"
++    );
++
++    return p;
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC_BY22 functions
++//
++// By and large these are (at best) no faster than their C equivalents - the
++// only one worth having is _peek where we do a slightly better job than the
++// compiler
++//
++// The others have been stashed here for reference in case larger scale asm
++// is attempted in which case they might be a useful base
++
++
++#define get_cabac_by22_peek get_cabac_by22_peek_arm
++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
++{   // Returns ((c->low & ~1) scaled by c->range) << 1; the scaling is skipped when c->range is 0. Does not modify *c.
++    uint32_t rv, tmp;
++    __asm__ (
++        "bic      %[rv]  , %[low], #1            \n\t"  // rv = low with bit 0 cleared
++        "cmp      %[inv] , #0                    \n\t"
++        "it       ne                             \n\t"
++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"  // inv != 0: rv = high 32 bits of (inv * rv); low half goes to tmp
++        :  // Outputs
++             [rv]"=&r"(rv),
++             [tmp]"=r"(tmp)  // scratch only; left unwritten when inv == 0
++        :  // Inputs
++             [low]"r"(c->low),
++             [inv]"r"(c->range)  // the operand name suggests range holds an inverse in this mode - confirm with caller
++        :  // Clobbers
++                "cc"
++    );
++    return rv << 1;
++}
++
++#if 0
++
++// ***** Slower than the C  :-(
++#define get_cabac_by22_flush get_cabac_by22_flush_arm
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
++{   // Disabled (#if 0) reference code: advances c->by22.bits by n and rebuilds c->low from val and the byte stream.
++    uint32_t m, tmp;
++    __asm__ (
++    "add    %[bits], %[bits], %[n]   \n\t"  // bits += n
++    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"  // m = 32-bit word at ptr + (bits >> 3)
++
++    "rsb    %[tmp], %[n], #32        \n\t"
++    "lsr    %[tmp], %[val], %[tmp]   \n\t"  // tmp = val >> (32 - n), i.e. the top n bits of val
++    "mul    %[tmp], %[range], %[tmp] \n\t"
++
++    "rev    %[m], %[m]               \n\t"  // byte-reverse the loaded word
++
++    "lsl    %[tmp], %[tmp], #23      \n\t"
++    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"  // low = (low << n) - tmp; register-shifted operand, no CONFIG_THUMB variant here
++
++    "and    %[tmp], %[bits], #7         \n\t"
++    "lsl    %[m], %[m], %[tmp]          \n\t"  // drop the (bits & 7) bits already consumed
++
++    "orr    %[low], %[low], %[m], lsr #9      \n\t"  // low |= m >> 9
++        :  // Outputs
++             [m]"=&r"(m),
++           [tmp]"=&r"(tmp),
++          [bits]"+&r"(c->by22.bits),
++           [low]"+&r"(c->low)
++        :  // Inputs
++               [n]"r"(n),
++             [val]"r"(val),
++             [inv]"r"(c->range),  // not referenced by the template above
++           [range]"r"(c->by22.range),
++             [ptr]"r"(c->bytestream)
++        :  // Clobbers
++    );
++}
++
++
++// Works but slower than C
++#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
++static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
++{   // Disabled (#if 0) reference code: peeks bits, counts the leading-1 prefix, builds level, then flushes the bits used.
++    uint32_t n, val, tmp, level;
++
++//    PROFILE_START();
++
++    __asm__ (
++            // Peek
++            "bic    %[val],  %[low],   #1  \n\t"
++            "cmp    %[inv], #0          \n\t"
++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"  // NOTE(review): no preceding "it ne", unlike get_cabac_by22_peek_arm
++            "lsl    %[val], %[val], #1  \n\t"
++
++            // Count bits (n = prefix)
++            "mvn    %[n], %[val] \n\t"
++            "clz    %[n], %[n]   \n\t"  // n = number of leading 1 bits in val
++
++            "lsl    %[level], %[val], %[n] \n\t"  // level = val with the prefix shifted out
++            "subs   %[tmp], %[n], #3 \n\t"
++            "blo    2f \n\t"
++
++            // prefix >= 3
++            // < tmp = prefix - 3
++            // > tmp = prefix + rice - 3
++            "add    %[tmp], %[tmp], %[rice] \n\t"
++            // > n = prefix * 2 + rice - 3
++            "add    %[n], %[tmp], %[n] \n\t"
++            "cmp    %[n], #21 \n\t"
++            "bhi    3f \n\t"
++
++            "orr    %[level], %[level], #0x80000000 \n\t"
++            "rsb    %[tmp], %[tmp], #31 \n\t"
++            "lsr    %[level], %[level], %[tmp] \n\t"
++
++            "mov    %[tmp], #2 \n\t"
++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"  // level += 2 << rice
++            "b      1f \n\t"
++
++            // > 22 bits used in total - need reload
++            "3:  \n\t"
++
++            // Stash prefix + rice - 3 in level (only spare reg)
++            "mov    %[level], %[tmp] \n\t"
++            // Restore n to flush value (prefix)
++            "sub    %[n], %[n], %[tmp] \n\t"
++
++            // Flush + reload
++
++//          "rsb    %[tmp], %[n], #32        \n\t"
++//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
++//          "mul    %[tmp], %[range], %[tmp] \n\t"
++
++            // As it happens we know that all the bits we are flushing are 1
++            // so we can cheat slightly
++            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"  // tmp = (range << n) - range
++            "lsl    %[tmp], %[tmp], #23      \n\t"
++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
++
++            "add    %[bits], %[bits], %[n]   \n\t"
++            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
++            "rev    %[n], %[n]               \n\t"
++            "and    %[tmp], %[bits], #7         \n\t"
++            "lsl    %[n], %[n], %[tmp]          \n\t"
++
++            "orr    %[low], %[low], %[n], lsr #9      \n\t"
++
++            // (reload)
++
++            "bic    %[val],  %[low],   #1  \n\t"
++            "cmp    %[inv], #0          \n\t"
++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
++            "lsl    %[val], %[val], #1  \n\t"
++
++            // Build value
++
++            "mov    %[n], %[level] \n\t"
++
++            "orr     %[tmp], %[val], #0x80000000 \n\t"
++            "rsb     %[level], %[level], #31 \n\t"
++            "lsr     %[level], %[tmp], %[level] \n\t"
++
++            "mov    %[tmp], #2 \n\t"
++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
++            "b      1f \n\t"
++
++            // prefix < 3
++            "2:  \n\t"
++            "rsb    %[tmp], %[rice], #31 \n\t"
++            "lsr    %[level], %[level], %[tmp] \n\t"
++            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"  // level |= prefix << rice
++            "add    %[n], %[n], %[rice] \n\t"
++
++            "1:  \n\t"
++            // Flush
++            "add    %[n], %[n], #1 \n\t"
++
++            "rsb    %[tmp], %[n], #32        \n\t"
++            "lsr    %[tmp], %[val], %[tmp]   \n\t"  // tmp = top n bits of val
++
++            "add    %[bits], %[bits], %[n]   \n\t"
++            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
++
++            "mul    %[tmp], %[range], %[tmp] \n\t"
++            "lsl    %[tmp], %[tmp], #23      \n\t"
++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
++
++            "rev    %[val], %[val]               \n\t"
++            "and    %[tmp], %[bits], #7         \n\t"
++            "lsl    %[val], %[val], %[tmp]          \n\t"
++
++            "orr    %[low], %[low], %[val], lsr #9      \n\t"
++        :  // Outputs
++         [level]"=&r"(level),
++             [n]"=&r"(n),
++           [val]"=&r"(val),
++           [tmp]"=&r"(tmp),
++          [bits]"+&r"(c->by22.bits),
++           [low]"+&r"(c->low)
++        :  // Inputs
++            [rice]"r"(c_rice_param),
++             [inv]"r"(c->range),
++           [range]"r"(c->by22.range),
++             [ptr]"r"(c->bytestream)
++        :  // Clobbers
++                "cc"
++    );
++
++//    PROFILE_ACC(residual_abs);
++
++    return level;
++}
++#endif
++
++#endif /* HAVE_ARMV6T2_INLINE */
++
++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
+diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+index 166bddb..a088cc3 100644
+--- a/libavcodec/arm/hevcdsp_deblock_neon.S
++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+         vst1.8   {d4}, [r0]
+         bx       lr
+ endfunc
++
++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
++ */
++function ff_hevc_deblocking_boundary_strengths_neon, export=1
++        add         ip, sp, #4*4    @ ip = address of the 5th stack word, taken before the push moves sp
++        push        {a2-a4,v1-v8,lr} @ 12 words; a2 and a3 are reloaded from here after each unit
++        ldmia       ip, {v5-v7}     @ v5 = curr, v6 = neigh, v7 = bs (stack words 5..7)
++1:      ldmdb       ip, {v1-v4}     @ v1-v4 = stack words 1..4 (the four rpl pointers in the prototype comment)
++        ldrsb       a3, [v5, #8]    @ curr->ref_idx
++        ldrsb       v8, [v5, #9]
++        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
++        ldrsb       lr, [v6, #9]
++        ldr         v1, [v1, a3, lsl #2] @ v1-v4 become the rpl entries selected by the four ref_idx values
++        ldrb        a3, [v5, #10]   @ curr->pred_flag
++        ldr         v2, [v2, v8, lsl #2]
++        ldrb        v8, [v6, #10]   @ neigh->pred_flag
++        ldr         v3, [v3, ip, lsl #2]
++        ldr         v4, [v4, lr, lsl #2]
++        teq         a3, #3          @ curr pred_flag == 3 (presumably both lists used - confirm)
++        beq         20f
++        teq         v8, #3          @ only neigh has pred_flag 3: strength 1
++        beq         90f
++
++        tst         a3, #1          @ bit 0 of pred_flag picks mv[0], otherwise mv[1]
++        itee        ne
++        ldrne       a3, [v5, #0]    @ curr->mv[0]
++        ldreq       a3, [v5, #4]    @ curr->mv[1]
++        moveq       v1, v2
++        tst         v8, #1
++        itee        ne
++        ldrne       v8, [v6, #0]    @ neigh->mv[0]
++        ldreq       v8, [v6, #4]    @ neigh->mv[1]
++        moveq       v3, v4
++        teq         v1, v3          @ different rpl entries: strength 1
++        bne         10f
++        ldr         lr, =0xFFFCFFFC @ mask keeping bits 2..15 of each halfword
++        ssub16      ip, v8, a3      @ per-halfword neigh - curr
++        ssub16      a3, a3, v8      @ per-halfword curr - neigh, sets the GE flags used by sel
++        sel         a3, a3, ip      @ picks the non-negative difference in each halfword
++        ands        a3, a3, lr      @ non-zero when either halfword difference is >= 4
++        @ drop through
++10:     it          ne
++        movne       a3, #1          @ flags say not-equal: strength 1, otherwise a3 is kept (0 after ands)
++11:     subs        a2, a2, #1
++12:                                 @ stores a3 to bs a2 (dup) times, stepping bs by a4 (out_inc)
++A       strbhs      a3, [v7], a4
++T       itt         hs
++T       strbhs      a3, [v7]
++T       addhs       v7, v7, a4
++        subs        a2, a2, #1
++        bhs         12b
++
++        ldm         sp, {a2, a3}    @ reload the saved a2 and a3 (dup and in_inc in the prototype comment)
++        add         ip, sp, #16*4   @ 12 pushed words + 4: same address as ip at entry
++        subs        a1, a1, #1      @ one unit done
++        add         v5, v5, a3      @ curr += a3
++        add         v6, v6, a3      @ neigh += a3
++        bhi         1b              @ flags still from the subs on a1
++        pop         {a2-a4,v1-v8,pc}
++
++20:     teq         v8, #3          @ curr has pred_flag 3; neigh must too, otherwise strength 1
++        bne         10b
++
++        teq         v1, v3
++        it          eq
++        teqeq       v2, v4
++        bne         40f             @ entries do not match in straight order: try the crossed order
++        teq         v1, v2
++        bne         30f             @ the two entries differ: only the straight comparison applies
++
++        ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      a3, v1, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        bne         25f
++        ssub16      ip, v4, v2
++        ssub16      a3, v2, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        beq         11b             @ straight comparison within limit: strength 0 (a3 is 0 here)
++        @ drop through
++25:     ssub16      ip, v4, v1      @ crossed comparison: curr mv[0] against neigh mv[1]
++        ssub16      a3, v1, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        bne         10b
++        ssub16      ip, v3, v2      @ and curr mv[1] against neigh mv[0]
++        ssub16      a3, v2, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        b           10b
++
++30:     ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      a3, v1, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        bne         10b
++        ssub16      ip, v4, v2
++        ssub16      a3, v2, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr
++        b           10b
++
++40:     teq         v1, v4          @ crossed order: v1 against v4 and v2 against v3
++        ite         eq
++        teqeq       v2, v3
++        bne         10b             @ evaluated on the flags current at this point, so it sees the teqeq result
++
++        ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        b           25b
++
++90:     mov         a3, #1          @ strength 1
++        b           11b
++endfunc
+diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+new file mode 100644
+index 0000000..00eab9e
+--- /dev/null
++++ b/libavcodec/arm/hevcdsp_epel_neon.S
+@@ -0,0 +1,337 @@
++/*
++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++#define MAX_PB_SIZE #64
++
++.macro vextin_d4 // loads 16 bytes from [r1] (r1 += r2) and leaves the 8-byte windows at offsets 0..3 in d16..d19
++    vld1.8    {q10}, [r1], r2
++    vmov      d16, d20
++    vext.8    d17, d20, d21, #1
++    vext.8    d18, d20, d21, #2
++    vext.8    d19, d20, d21, #3
++.endm
++
++.macro vextin_d4_8 // loads 8 bytes from [r1] (r1 += r2) into d16; d17..d19 = d16 rotated down by 1..3 bytes
++    vld1.8    d16, [r1], r2
++    vext.8    d17, d16, d16, #1 // both sources are d16, so the top bytes wrap around from the start of d16
++    vext.8    d18, d16, d16, #2
++    vext.8    d19, d16, d16, #3
++.endm
++
++.macro load_coeffs_16b coeffs // loads the word at [coeffs] and broadcasts its bytes 0..3 across d0..d3
++    ldr      \coeffs, [\coeffs] // the address register is overwritten with the loaded word
++    vdup.i8  d0, \coeffs
++    lsr      \coeffs, #8
++    vdup.i8  d1, \coeffs
++    lsr      \coeffs, #8
++    vdup.i8  d2, \coeffs
++    lsr      \coeffs, #8
++    vdup.i8  d3, \coeffs
++.endm
++
++.macro epel_filter_16b out=q12 // out = (d17*d1 + d18*d2) - (d16*d0 + d19*d3) as 16-bit lanes
++    vmull.u8 q3, d16, d0 // scratch registers written here: q3, q10, q11
++    vmull.u8 q11, d19, d3
++    vmull.u8 \out, d17, d1
++    vmull.u8 q10, d18, d2
++    vadd.s16 q3, q11 // q3 = outer taps
++    vadd.s16 \out, q10 // out = inner taps
++    vsub.s16 \out, q3
++.endm
++
++.macro load_coeffs_32b coeffs // loads the word at [coeffs] and places its bytes 0..3 in byte lanes 0,2,4,6 of d4
++    ldr      \coeffs, [\coeffs] // the address register is overwritten with the loaded word
++    vmov.i64 d4, #0 // odd byte lanes stay zero, so each 16-bit lane of d4 holds one coefficient byte
++    vmov.8   d4[0], \coeffs
++    lsr      \coeffs, #8
++    vmov.8   d4[2], \coeffs
++    lsr      \coeffs, #8
++    vmov.8   d4[4], \coeffs
++    lsr      \coeffs, #8
++    vmov.8   d4[6], \coeffs
++.endm
++
++.macro epel_filter_32b // q3 = sat16(((q13*d4[1] + q14*d4[2]) - (q12*d4[0] + q15*d4[3])) >> 6), 8 lanes
++    vmull.s16 q3, d24, d4[0] //q12
++    vmull.s16 q4, d25, d4[0]
++    vmull.s16 q5, d30, d4[3] //q15
++    vmull.s16 q6, d31, d4[3]
++
++    vmull.s16 q7, d26, d4[1] // q13
++    vmull.s16 q8, d27, d4[1]
++    vmull.s16 q9, d28, d4[2] // q14
++    vmull.s16 q10, d29, d4[2]
++    vadd.s32 q3, q5 // q3:q4 = outer taps, low and high halves
++    vadd.s32 q4, q6
++    vadd.s32 q7, q9 // q7:q8 = inner taps, low and high halves
++    vadd.s32 q8, q10
++    vsub.s32 q7, q3
++    vsub.s32 q8, q4
++    vqshrn.s32  d6, q7, #6 // saturating narrow to 16 bits after >> 6
++    vqshrn.s32  d7, q8, #6
++.endm
++
++.macro epel_filter_32b_4 // 4-lane form: d6 = sat16(((d26*d4[1] + d28*d4[2]) - (d24*d4[0] + d30*d4[3])) >> 6)
++    vmull.s16 q3, d24, d4[0] //q12
++    vmull.s16 q5, d30, d4[3] //q15
++    vmull.s16 q7, d26, d4[1] // q13
++    vmull.s16 q9, d28, d4[2] // q14
++    vadd.s32 q3, q5 // outer taps
++    vadd.s32 q7, q9 // inner taps
++    vsub.s32 q7, q3
++    vqshrn.s32  d6, q7, #6 // saturating narrow to 16 bits after >> 6
++.endm
++
++function ff_hevc_put_epel_h_neon_8, export=1
++        push   {r4-r7} // 16 bytes, so the first stack argument is now at sp + 16
++        mov    r4, MAX_PB_SIZE
++        ldr    r7, [sp, #16] // mx
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2 // r7 = (mx - 1) * 4, byte offset of the coefficient word
++        vpush {d8-d15} // NOTE(review): nothing visible in this function or its macros writes d8-d15 - confirm this save is needed
++@ adr reaches if we are in thumb mode but not in arm
++T       adr    r12, epel_coeffs
++A       adrl   r12, epel_coeffs
++        add    r7, r12
++        sub       r1, #1 // start one byte before the source pointer
++        lsl       r4, #1 // r4 = 2 * MAX_PB_SIZE, used as the post-increment of r0 on every store
++        load_coeffs_16b r7
++        mov   r12, r3 // r12 = saved r3 (rows per column strip)
++        mov   r6, r0 // r6 = r0 base of the current strip
++        mov   r7, r1 // r7 = r1 base of the current strip
++        cmp       r5, #6
++        bgt       8f // width > 6: 8-wide strips
++        cmp       r5, #4
++        blt       2f // width < 4: 2-wide strip
++        b         4f
++8:      subs r3, #1
++        pld [r1]
++        vextin_d4
++        epel_filter_16b
++        vst1.16    {q12}, [r0], r4 // 8 results per row
++        bne 8b // flags from the subs above
++        subs    r5, #8
++        beq  99f
++        mov       r3, r12 // next strip: restore the row count, step r0 base by 16 bytes and r1 base by 8
++        add       r6, #16
++        mov       r0, r6
++        add       r7, #8
++        mov       r1, r7
++        cmp       r5, #4
++        bgt       8b // more than 4 columns left: another 8-wide strip, otherwise fall into the 4-wide loop
++4:      subs r3, #1
++        pld [r1]
++        vextin_d4_8
++        epel_filter_16b
++        vst1.16    d24, [r0], r4 // 4 results per row
++        bne 4b
++        subs      r5, #4
++        beq       99f
++        mov       r3, r12 // next strip: step r0 base by 8 bytes and r1 base by 4
++        add       r6, #8
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++2:      subs r3, #1
++        pld [r1]
++        vextin_d4_8
++        epel_filter_16b
++        vst1.32    d24[0], [r0], r4 // 2 results per row (one 32-bit lane)
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
++
++function ff_hevc_put_epel_v_neon_8, export=1 // per the C prototype: r0=dst r1=src r2=srcstride r3=height, stack: mx, my, width
++        push   {r4-r7} // 16 bytes pushed, so the stack arguments now start at sp+16
++        mov    r4, MAX_PB_SIZE
++        ldr    r7, [sp, #20] // my
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2 // r7 = (my - 1) * 4: byte offset of a 4-byte row in epel_coeffs
++        vpush {d8-d15}
++T       adr    r12, epel_coeffs
++A       adrl   r12, epel_coeffs
++        add    r7, r12 // r7 -> coefficient row selected by my
++        load_coeffs_16b r7
++        sub       r1, r2 // start one row above src
++        lsl       r4, #1 // r4 = 2 * MAX_PB_SIZE: byte step between dst rows (dst is int16_t)
++        mov   r12, r3 // keep height so every column strip can restart the row counter
++        mov   r6, r0 // r6 = dst base of the current column strip
++        mov   r7, r1 // r7 = src base of the current column strip
++0:      pld [r1] // strip start: preload the first three rows of the 4-row window
++        vld1.8    {d16}, [r1], r2
++        pld [r1]
++        vld1.8    {d17}, [r1], r2
++        pld [r1]
++        vld1.8    {d18}, [r1], r2
++        cmp       r5, #6
++        bgt       8f // more than 6 columns left: 8-pixel strip
++        cmp       r5, #4
++        blt       2f // fewer than 4 columns left: 2-pixel strip
++        b         4f // otherwise: 4-pixel strip
++8:      pld [r1]
++        vld1.8    {d19}, [r1], r2 // fourth row of the window
++        subs r3, #1 // row counter for the bne 8b below
++        epel_filter_16b
++        vst1.16    {q12}, [r0], r4 // store 8 results, advance dst by one row
++        vmov d16, d17 // slide the window down by one row
++        vmov d17, d18
++        vmov d18, d19
++        bne 8b
++        subs    r5, #8 // 8 columns done
++        beq  99f
++        mov       r3, r12 // restart row counter
++        add       r6, #16 // next strip: 8 int16_t further in dst
++        mov       r0, r6
++        add       r7, #8 // and 8 pixels further in src
++        mov       r1, r7
++        b         0b // reload the window for the next strip
++4:      pld       [r1]
++        vld1.8    {d19}, [r1], r2 // fourth row of the window
++        subs r3, #1
++        epel_filter_16b
++        vst1.16    d24, [r0], r4 // store 4 results
++        vmov d16, d17
++        vmov d17, d18
++        vmov d18, d19
++        bne 4b
++        subs      r5, #4 // 4 columns done
++        beq       99f
++        mov       r3, r12
++        add       r6, #8 // next strip: 4 int16_t further in dst
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++        b         0b
++2:      pld [r1]
++        vld1.8    {d19}, [r1], r2 // fourth row of the window
++        subs r3, #1
++        epel_filter_16b
++        vst1.32    d24[0], [r0], r4 // store 2 results (one 32-bit lane)
++        vmov d16, d17
++        vmov d17, d18
++        vmov d18, d19
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
++
++function ff_hevc_put_epel_hv_neon_8, export=1 // per the C prototype: r0=dst r1=src r2=srcstride r3=height, stack: mx, my, width
++        push   {r4-r7} // 16 bytes pushed, so the stack arguments now start at sp+16
++        mov    r4, MAX_PB_SIZE
++        ldr    r6, [sp, #16] // mx
++        ldr    r7, [sp, #20] // my
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2 // r7 = (my - 1) * 4
++        vpush {d8-d15}
++        adr    r12, epel_coeffs // NOTE(review): plain adr here, while the h and v variants use the T/A adr/adrl pair
++        sub    r6, #1
++        lsl    r6, #2 // r6 = (mx - 1) * 4
++        add    r6, r12 // mx epel coeff offset
++        add    r7, r12 // my epel coeff offset
++        sub       r1, #1 // start one pixel to the left ...
++        sub       r1, r2 // ... and one row above src
++        lsl       r4, #1 // r4 = 2 * MAX_PB_SIZE: byte step between dst rows (dst is int16_t)
++        load_coeffs_16b r6
++        load_coeffs_32b r7
++        mov   r12, r3 // keep height so every column strip can restart the row counter
++        mov   r6, r0 // r6 = dst base of the current column strip
++        mov   r7, r1 // r7 = src base of the current column strip
++0:      pld   [r1] // strip start: first three rows into q12, q13, q14
++        vextin_d4
++        epel_filter_16b q12 // macro defined outside this excerpt; the argument is presumably the output register
++        pld   [r1]
++        vextin_d4
++        epel_filter_16b q13
++        pld   [r1]
++        vextin_d4
++        epel_filter_16b q14
++        cmp       r5, #6
++        bgt       8f // more than 6 columns left: 8-pixel strip
++        cmp       r5, #4
++        blt       2f // fewer than 4 columns left: 2-pixel strip
++        b         4f // otherwise: 4-pixel strip
++8:      pld     [r1]
++        vextin_d4
++        epel_filter_16b q15 // fourth row of the window
++        subs r3, #1 // row counter for the bne 8b below
++        epel_filter_32b // combine q12-q15 into q3
++        vst1.16    {q3}, [r0], r4 // store 8 results, advance dst by one row
++        vmov q12, q13 // slide the window down by one row
++        vmov q13, q14
++        vmov q14, q15
++        bne 8b
++        subs    r5, #8 // 8 columns done
++        beq  99f
++        mov       r3, r12 // restart row counter
++        add       r6, #16 // next strip: 8 int16_t further in dst
++        mov       r0, r6
++        add       r7, #8 // and 8 pixels further in src
++        mov       r1, r7
++        b         0b // rebuild the window for the next strip
++4:      pld      [r1]
++        vextin_d4_8
++        epel_filter_16b q15 // fourth row of the window
++        subs r3, #1
++        epel_filter_32b_4 // combine the low halves of q12-q15 into d6
++        vst1.16    d6, [r0], r4 // store 4 results
++        vmov q12, q13
++        vmov q13, q14
++        vmov q14, q15
++        bne 4b
++        subs      r5, #4 // 4 columns done
++        beq       99f
++        mov       r3, r12
++        add       r6, #8 // next strip: 4 int16_t further in dst
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++        b         0b
++2:      pld      [r1]
++        vextin_d4_8
++        epel_filter_16b q15 // fourth row of the window
++        subs r3, #1
++        epel_filter_32b_4
++        vst1.32    d6[0], [r0], r4 // store 2 results (one 32-bit lane)
++        vmov q12, q13
++        vmov q13, q14
++        vmov q14, q15
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
++
++epel_coeffs: // 7 rows of 4 tap magnitudes; the epel functions address a row at (mx or my - 1) * 4
++       .byte 2, 58, 10, 2 // row 0 (mx or my == 1)
++       .byte 4, 54, 16, 2
++       .byte 6, 46, 28, 4
++       .byte 4, 36, 36, 4
++       .byte 4, 28, 46, 6
++       .byte 2, 16, 54, 4
++       .byte 2, 10, 58, 2 // row 6 (mx or my == 7)
+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
+index 5591807..49c70dd 100644
+--- a/libavcodec/arm/hevcdsp_init_neon.c
++++ b/libavcodec/arm/hevcdsp_init_neon.c
+@@ -22,6 +22,8 @@
+ #include "libavutil/arm/cpu.h"
+ #include "libavcodec/hevcdsp.h"
+ #include "hevcdsp_arm.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/bit_depth_template.c"
+ 
+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+ 
++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
++
++void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++
++void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
++
+ #define PUT_PIXELS(name) \
+     void name(int16_t *dst, uint8_t *src, \
+                                 ptrdiff_t srcstride, int height, \
+@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+ #undef PUT_PIXELS
++void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
++                                ptrdiff_t srcstride, int height,
++                                intptr_t mx, intptr_t my, int width);
++void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
++                                ptrdiff_t srcstride, int height,
++                                intptr_t mx, intptr_t my, int width);
++void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
++                                ptrdiff_t srcstride, int height,
++                                intptr_t mx, intptr_t my, int width);
+ 
+ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                    int height, int width);
+@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
+     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+ }
+ 
++static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ // SAO band filter: NEON kernel for width 8/16/32/64 when height % 8 == 0, scalar loop otherwise
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int8_t offset_table[32] = { 0 }; // one offset per band index (src >> shift), zero unless set below
++    int k, y, x;
++    int shift  = 3; // BIT_DEPTH - 5
++    int cwidth = 0; // 0 selects the scalar default case of the switch
++
++    stride_src /= sizeof(pixel);
++    stride_dst /= sizeof(pixel);
++
++    for (k = 0; k < 4; k++) // four consecutive bands starting at sao_left_class, wrapping modulo 32
++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
++
++    if (height % 8 == 0) // NEON kernels are only used when height is a multiple of 8
++        cwidth = width;
++
++    switch(cwidth){ // note: the kernels take stride_src before stride_dst
++    case 8:
++        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
++        break;
++    case 16:
++        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
++        break;
++    case 32:
++        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
++        break;
++    case 64:
++        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
++        break;
++    default: // scalar fallback for every other width/height combination
++        for (y = 0; y < height; y++) {
++            for (x = 0; x < width; x++)
++                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
++            dst += stride_dst;
++            src += stride_src;
++        }
++    }
++}
++
++#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
++static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                          int16_t *_sao_offset_val, int eo, int width, int height)
++{ // SAO edge filter: NEON kernel for width 32/64 when height % 8 == 0, scalar loop otherwise
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; // slot (diff0 + diff1 + 2) -> index into _sao_offset_val
++    static const int8_t pos[4][2][2] = { // per eo: {dx, dy} of neighbour a and of neighbour b
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    int8_t sao_offset_val[8] = { 0 };  // 5 offsets + 3 zeroed padding bytes: the NEON kernels load all 8
++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); // src stride is fixed, not a parameter
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int a_stride, b_stride;
++    int x, y;
++    int cwidth = 0; // 0 selects the scalar default case of the switch
++
++    for (x = 0; x < 5; x++) { // reorder the offsets so they can be indexed directly by diff0 + diff1 + 2
++        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
++    }
++
++    if (height % 8 == 0) // NEON kernels are only used when height is a multiple of 8
++        cwidth = width;
++
++    stride_src /= sizeof(pixel);
++    stride_dst /= sizeof(pixel);
++
++    switch (cwidth) {
++    case 32:
++        switch(eo) { // one kernel per edge direction
++        case 0:
++            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 1:
++            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 2:
++            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 3:
++            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        }
++        break;
++    case 64:
++        switch(eo) { // one kernel per edge direction
++        case 0:
++            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 1:
++            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 2:
++            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        case 3:
++            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
++            break;
++        }
++        break;
++    default: // scalar fallback for every other width/height combination
++        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; // element offset from src[x] to neighbour a
++        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; // element offset from src[x] to neighbour b
++        for (y = 0; y < height; y++) {
++            for (x = 0; x < width; x++) {
++                int diff0         = CMP(src[x], src[x + a_stride]);
++                int diff1         = CMP(src[x], src[x + b_stride]);
++                int idx           = diff0 + diff1;
++                if (idx) // idx == 0 leaves dst[x] untouched
++                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
++            }
++            src += stride_src;
++            dst += stride_dst;
++        }
++    }
++}
++#undef CMP
++
++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
++                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
++                                                MvField *curr, MvField *neigh, uint8_t *bs);
++
+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+ {
+     if (bit_depth == 8) {
+@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
+         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
+         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
++        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
++          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
++          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
++        }
+         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
+         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
++            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
++            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
++            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
+         }
++        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
++        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
++        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
++        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
++        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
++        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
++        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
++        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
++        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
++        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
++
+         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+     }
++
++    assert(offsetof(MvField, mv) == 0);
++    assert(offsetof(MvField, ref_idx) == 8);
++    assert(offsetof(MvField, pred_flag) == 10);
++    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
+ }
+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
+new file mode 100644
+index 0000000..9c7808d
+--- /dev/null
++++ b/libavcodec/arm/hevcdsp_sao_neon.S
+@@ -0,0 +1,510 @@
++/*
++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.macro init_sao_band // prologue shared by the sao_band kernels; on entry r2 = offset_table per the C prototype
++        pld      [r1]
++        vld1.8   {q0, q1}, [r2]  // offset table (32 bytes into d0-d3)
++        ldr       r2, [sp, #0]   // stride_dst
++        ldr      r12, [sp, #4]   // height
++        vmov.u8  q3, #128 // bias constant used by sao_band_64
++.endm
++
++// 128 in q3
++// input q8 - q11
++.macro sao_band_64 // 64 pixels in q8-q11, table indices in q12-q15: out = clip(pixel + table[index]) back in q8-q11
++        vtbl.8   d24, {d0, d1, d2, d3}, d24 // replace each index by its offset from the 32-byte table
++        vadd.s8  q8, q3 // add 128 modulo 256: unsigned pixel becomes signed pixel - 128
++        vtbl.8   d25, {d0, d1, d2, d3}, d25
++        vadd.s8  q9, q3
++        vtbl.8   d26, {d0, d1, d2, d3}, d26
++        vadd.s8  q10, q3
++        vtbl.8   d27, {d0, d1, d2, d3}, d27
++        vadd.s8  q11, q3
++        vtbl.8   d28, {d0, d1, d2, d3}, d28
++        vqadd.s8 q8, q12 // signed saturating add of the offset (acts as the clip)
++        vtbl.8   d29, {d0, d1, d2, d3}, d29
++        vqadd.s8 q9, q13
++        vtbl.8   d30, {d0, d1, d2, d3}, d30
++        vqadd.s8 q10, q14
++        vtbl.8   d31, {d0, d1, d2, d3}, d31
++        vsub.s8  q8, q3 // remove the 128 bias again
++        vqadd.s8 q11, q15
++        vsub.s8  q9, q3
++        vsub.s8  q10, q3
++        vsub.s8  q11, q3
++.endm
++
++function ff_hevc_sao_band_w8_neon_8, export=1 // width 8: eight rows (64 pixels) per loop iteration
++        init_sao_band
++1:      subs     r12, #8 // height counter, 8 rows per iteration
++        vld1.8   {d16}, [r1, :64], r3 // one 8-pixel row per d register, r3 = stride_src
++        vld1.8   {d17}, [r1, :64], r3
++        vshr.u8  q12, q8, #3 // table index = pixel >> 3
++        vld1.8   {d18}, [r1, :64], r3
++        vld1.8   {d19}, [r1, :64], r3
++        vshr.u8  q13, q9, #3
++        vld1.8   {d20}, [r1, :64], r3
++        vld1.8   {d21}, [r1, :64], r3
++        vshr.u8  q14, q10, #3
++        vld1.8   {d22}, [r1, :64], r3
++        vld1.8   {d23}, [r1, :64], r3
++        vshr.u8  q15, q11, #3
++        sao_band_64
++        vst1.8  {d16}, [r0, :64], r2 // write the rows back, r2 = stride_dst
++        vst1.8  {d17}, [r0, :64], r2
++        vst1.8  {d18}, [r0, :64], r2
++        vst1.8  {d19}, [r0, :64], r2
++        vst1.8  {d20}, [r0, :64], r2
++        vst1.8  {d21}, [r0, :64], r2
++        vst1.8  {d22}, [r0, :64], r2
++        vst1.8  {d23}, [r0, :64], r2
++        bne    1b
++
++        bx lr
++endfunc
++
++function ff_hevc_sao_band_w16_neon_8, export=1 // width 16: four rows (64 pixels) per loop iteration
++        init_sao_band
++1:      subs     r12, #4 // height counter, 4 rows per iteration
++        vld1.8  {q8}, [r1, :128], r3 // one 16-pixel row per q register, r3 = stride_src
++        vshr.u8  q12, q8, #3 // table index = pixel >> 3
++        vld1.8  {q9}, [r1, :128], r3
++        vshr.u8  q13, q9, #3
++        vld1.8  {q10}, [r1, :128], r3
++        vshr.u8  q14, q10, #3
++        vld1.8  {q11}, [r1, :128], r3
++        vshr.u8  q15, q11, #3
++        sao_band_64
++        vst1.8   {q8}, [r0, :128], r2 // write the rows back, r2 = stride_dst
++        vst1.8   {q9}, [r0, :128], r2
++        vst1.8   {q10}, [r0, :128], r2
++        vst1.8   {q11}, [r0, :128], r2
++        bne    1b
++
++        bx lr
++endfunc
++
++function ff_hevc_sao_band_w32_neon_8, export=1 // width 32: two rows (64 pixels) per loop iteration
++        init_sao_band
++1:      subs     r12, #2 // height counter, 2 rows per iteration
++        vld1.8   {q8-q9}, [r1, :128], r3 // one 32-pixel row, r3 = stride_src
++        vshr.u8  q12, q8, #3 // table index = pixel >> 3
++        vshr.u8  q13, q9, #3
++        vld1.8   {q10-q11}, [r1, :128], r3
++        vshr.u8  q14, q10, #3
++        vshr.u8  q15, q11, #3
++        sao_band_64
++        vst1.8   {q8-q9}, [r0, :128], r2 // write the rows back, r2 = stride_dst
++        vst1.8   {q10-q11}, [r0, :128], r2
++        bne      1b
++
++        bx       lr
++endfunc
++
++function ff_hevc_sao_band_w64_neon_8, export=1 // width 64: one row (64 pixels) per loop iteration
++        init_sao_band
++1:      subs      r12, #1 // height counter, 1 row per iteration
++        pld       [r1, r3] // prefetch the next row
++        vld1.8    {q8-q9}, [r1, :128]! // first 32 pixels, r1 += 32
++        vshr.u8  q12, q8, #3 // table index = pixel >> 3
++        vshr.u8  q13, q9, #3
++        vld1.8    {q10-q11}, [r1, :128], r3 // last 32 pixels, r1 += stride_src
++        vshr.u8  q14, q10, #3
++        vshr.u8  q15, q11, #3
++        sub       r1, #32 // undo the +32: r1 now at the start of the next row
++        sao_band_64
++        vst1.8    {q8-q9}, [r0, :128]! // same addressing pattern for dst with r2 = stride_dst
++        vst1.8    {q10-q11}, [r0, :128], r2
++        sub       r0, #32
++        bne       1b
++
++        bx lr
++endfunc
++
++.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 // 32 bytes: out = +1 if in2/3 > in0/1, -1 if less, 0 if equal; clobbers tmp0, tmp1
++        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
++        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
++        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
++        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
++        vsub.s8 \out0, \tmp0, \out0 // diff0 = (a > c) - (c > a) on the -1/0 masks
++        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
++.endm
++
++.macro table64 // q0-q3 = diff0 + diff1, q4-q7 = c pixels, r4/r5 = offset table: store 64 clipped pixels at r0
++        vmov.s8 q13, #2 // 2 to all elements
++        vmov.32  d24[0], r4  // load offset table from general registers
++        vmov.32  d24[1], r5  // load rest of offset table
++
++        vadd.s8 q0, q13 // diff sum + 2 = table index
++        vadd.s8 q1, q13
++        vadd.s8 q2, q13
++        vadd.s8 q3, q13
++
++        vmov.u8  q15, #128 // s8 #-128
++        vtbl.8   d0, {d24}, d0 // replace each index by its offset
++        vadd.s8  q13,  q4, q15 // bias c by 128 so the signed saturating add acts as the clip
++        vtbl.8   d1, {d24}, d1
++        vadd.s8  q14,  q5, q15
++        vtbl.8   d2, {d24}, d2
++        vqadd.s8 q0, q13 // offset + biased pixel, saturating
++        vtbl.8   d3, {d24}, d3
++        vqadd.s8 q1, q14
++        vtbl.8   d4, {d24}, d4
++        vadd.s8  q13,  q6, q15
++        vtbl.8   d5, {d24}, d5
++        vadd.s8  q14,  q7, q15
++        vtbl.8   d6, {d24}, d6
++        vqadd.s8 q2, q13
++        vtbl.8   d7, {d24}, d7
++        vqadd.s8 q3, q14
++        vsub.s8   q0, q15 // remove the 128 bias again
++        vsub.s8   q1, q15
++        vsub.s8   q2, q15
++        vsub.s8   q3, q15
++        vst1.8  {q0-q1}, [r0, :128]! // first 32 pixels, r0 += 32
++        vst1.8  {q2-q3}, [r0, :128], r2 // last 32 pixels, r0 += r2
++        sub     r0, #32 // undo the +32: r0 advanced by exactly r2
++.endm
++
++// input
++// a in q0 - q3
++// c in q4 - q7
++// b in q8 - q11
++// offset table in r4 and r5
++// output in q0 - q3
++// clobbers q12 - q15
++.macro edge_w64_body // compare c with a and with b for 64 pixels, then look up, apply and store via table64
++        diff32 q12, q13, q0, q1, q0, q1, q4, q5 // q12,q13 = c vs a, first 32 pixels (a registers double as temporaries)
++        diff32 q0, q1, q14, q15, q8, q9, q4, q5 // q0,q1 = c vs b, first 32 pixels
++
++        vadd.s8  q0, q12 //diff0 + diff1
++        vadd.s8  q1, q13
++
++        diff32  q14, q15, q2, q3, q2, q3, q6, q7 // q14,q15 = c vs a, last 32 pixels
++        diff32  q2, q3, q12, q13, q10, q11, q6, q7 // q2,q3 = c vs b, last 32 pixels
++
++        vadd.s8  q2, q14 // diff0 + diff1, last 32 pixels
++        vadd.s8  q3, q15
++        table64
++.endm
++
++.macro init_edge_64 // prologue of the w64 edge kernels: r12 = height, r4/r5 = 8 bytes of the offset table
++        push   {r4-r5} // 8 bytes pushed, so the stack arguments now start at sp+8
++        ldr    r12, [sp, #8] // height
++        ldr    r5, [sp, #12] // sao_offset_val_table
++        ldr    r4, [r5] // table bytes 0-3
++        add    r5, #4
++        ldr    r5, [r5] // table bytes 4-7
++.endm
++
++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 // a = pixel to the left, b = pixel to the right; one 64-pixel row per iteration
++        init_edge_64
++        vpush {d8-d15}
++        sub    r1, #8 // start 8 bytes before the row so the left neighbour can be loaded
++1:      subs    r12, #1
++        vld1.64  {d7}, [r1, :64]! // 8 bytes preceding the row (upper half of q3)
++        vld1.64  {q4-q5}, [r1, :128]! // load c
++        vld1.64  {q6-q7}, [r1, :128]!
++        vld1.64  {d24}, [r1, :64], r3 // 8 bytes following the row, then r1 += r3 (stride_src)
++        sub      r1, #72 // undo the 72 bytes of writeback from the three loads above
++        // load a
++        vext.8 q0, q3, q4, #15 // c shifted by one byte: last byte of q3 followed by 15 bytes of q4
++        vext.8 q1, q4, q5, #15
++        vext.8 q2, q5, q6, #15
++        vext.8 q3, q6, q7, #15
++        // load b
++        vext.8 q8, q4, q5, #1 // c shifted by one byte the other way
++        vext.8 q9, q5, q6, #1
++        vext.8 q10, q6, q7, #1
++        vext.8 q11, q7, q12, #1
++        edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r5}
++        bx lr
++endfunc
++
++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 // a = row above, b = row below; rows are carried over between iterations
++        init_edge_64
++        vpush {d8-d15}
++        sub     r1, r3 // start at the row above
++        // load a
++        vld1.8  {q0-q1}, [r1, :128]!
++        vld1.8  {q2-q3}, [r1, :128], r3
++        sub     r1, #32 // net effect of the three lines: r1 += r3
++        // load c
++        vld1.8  {q4-q5}, [r1, :128]!
++        vld1.8  {q6-q7}, [r1, :128], r3
++        sub     r1, #32
++1:      subs    r12, #1
++        // load b
++        vld1.8  {q8-q9}, [r1, :128]!
++        vld1.8  {q10-q11}, [r1, :128], r3
++        sub     r1, #32
++        edge_w64_body
++        // copy c to a
++        vmov.64 q0, q4
++        vmov.64 q1, q5
++        vmov.64 q2, q6
++        vmov.64 q3, q7
++        // copy b to c
++        vmov.64 q4, q8
++        vmov.64 q5, q9
++        vmov.64 q6, q10
++        vmov.64 q7, q11
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r5}
++        bx lr
++endfunc
++
++function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 // a = up-left neighbour, b = down-right neighbour; all three rows reloaded per iteration
++        init_edge_64
++        vpush {d8-d15}
++1:      sub     r1, r3 // row above
++        // load a
++        // TODO: fix unaligned load
++        //       don't reload a like in eo1
++        sub     r1, #1 // one byte to the left
++        vld1.8  {q0-q1}, [r1]!
++        vld1.8  {q2-q3}, [r1], r3
++        sub     r1, #31 // r1 back at the start of the current row
++        subs    r12, #1
++        // load c
++        vld1.8  {q4-q5}, [r1, :128]!
++        vld1.8  {q6-q7}, [r1, :128], r3
++        sub     r1, #32 // r1 at the start of the row below
++        // load b
++        add     r1, #1 // one byte to the right
++        vld1.8  {q8-q9}, [r1]!
++        vld1.8  {q10-q11}, [r1]
++        sub     r1, #33 // r1 at the start of the next row to filter
++        edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r5}
++        bx lr
++endfunc
++
++function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 // a = up-right neighbour, b = down-left neighbour; all three rows reloaded per iteration
++        init_edge_64
++        vpush {d8-d15}
++1:      sub     r1, r3 // row above
++        // load a
++        // TODO: fix unaligned load
++        //       don't reload a like in eo1
++        add     r1, #1 // one byte to the right
++        vld1.8  {q0-q1}, [r1]!
++        vld1.8  {q2-q3}, [r1], r3
++        sub     r1, #33 // r1 back at the start of the current row
++        subs    r12, #1
++        // load c
++        vld1.8  {q4-q5}, [r1, :128]!
++        vld1.8  {q6-q7}, [r1, :128], r3
++        sub     r1, #32 // r1 at the start of the row below
++        // load b
++        sub     r1, #1 // one byte to the left
++        vld1.8  {q8-q9}, [r1]!
++        vld1.8  {q10-q11}, [r1]
++        sub     r1, #31 // r1 at the start of the next row to filter
++        edge_w64_body
++        bne   1b
++        vpop  {d8-d15}
++        pop   {r4-r5}
++        bx lr
++endfunc
++
++.macro init_edge_32 // prologue of the w32 edge kernels: d31 = 8 bytes of the offset table, r12 = height
++        ldr     r12, [sp, #4] // sao_offset_val_table
++        vld1.32 {d31}, [r12] // 8 table bytes, used by table32 as the vtbl source
++        ldr     r12, [sp] // height
++.endm
++
++.macro diff out0, tmp0, in0, in1 // single-register form of diff32: out = +1 if in1 > in0, -1 if less, 0 if equal
++        vcgt.u8 \out0, \in1, \in0  // c > a -> -1 , otherwise 0
++        vcgt.u8 \tmp0,  \in0, \in1  // a > c -> -1 , otherwise 0
++        vsub.s8 \out0, \tmp0, \out0 // diff0 = (a > c) - (c > a) on the -1/0 masks
++.endm
++
++.macro table32 // q0,q1 = diff0 + diff1, q2,q3 = c pixels, d31 = offset table: store 32 clipped pixels at r0; clobbers q10-q12
++        vmov.s8  q10, #2
++        vadd.s8  q0, q10 // diff sum + 2 = table index
++        vadd.s8  q1, q10
++        vmov.s8  q10, #128 // bias constant
++        vtbl.8   d0, {d31}, d0 // replace each index by its offset
++        vadd.s8  q11, q2, q10 // bias c by 128 so the signed saturating add acts as the clip
++        vtbl.8   d1, {d31}, d1
++        vadd.s8  q12, q3, q10
++        vtbl.8   d2, {d31}, d2
++        vqadd.s8 q11, q0 // biased pixel + offset, saturating
++        vtbl.8   d3, {d31}, d3
++        vqadd.s8 q12, q1
++        vsub.s8  q0, q11, q10 // remove the 128 bias again
++        vsub.s8  q1, q12, q10
++        vst1.8   {q0-q1}, [r0, :128], r2 // store one row, r0 += r2 (stride_dst)
++.endm
++
++function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 // a = pixel to the left, b = pixel to the right; one 32-pixel row per iteration
++        init_edge_32
++        vpush {q4-q7}
++        sub     r1, #4 // start 4 bytes before the row
++1:      subs    r12, #1
++        vld1.8  {q13-q14}, [r1]! // 32 bytes from row - 4
++        vld1.32 d30, [r1], r3 // next 8 bytes, then r1 += r3 (stride_src)
++        sub     r1, #32 // undo the +32: r1 advanced by exactly r3
++        // a
++        vext.8   q0, q13, q14, #3 // bytes from offset 3, i.e. starting one byte before the row
++        vext.8   q1, q14, q15, #3
++        vshr.u64 d24, d30, #24 // drop 3 low bytes of d30: a for the pixel just past the row
++        // c
++        vext.8   q2, q13, q14, #4 // bytes from offset 4, i.e. the row itself
++        vext.8   q3, q14, q15, #4
++        vshr.u64 d16, d30, #32 // drop 4 low bytes of d30: c for the pixel just past the row
++        // diff0
++        diff32 q13, q14, q4, q5, q0, q1, q2, q3
++        diff   d18, d25, d24, d16 // same comparison for the pixel just past the row (byte 0 of d18)
++        // -diff1
++        vext.s8 q0, q13, q14, #1 // diff0 of the next pixel equals minus diff1 of this pixel
++        vext.s8 q1, q14, q9, #1
++
++        vsub.s8 q0, q13, q0 //diff0 + diff1
++        vsub.s8 q1, q14, q1
++        table32
++        bne     1b
++        vpop {q4-q7}
++
++        bx      lr
++endfunc
++
++function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 // a = row above, b = row below; the comparison with the previous row is reused
++        init_edge_32
++        vpush {q4-q7}
++        // load a
++        sub     r1, r3 // start at the row above
++        vld1.8  {q0-q1}, [r1, :128], r3
++        // load c
++        vld1.8  {q2-q3}, [r1, :128], r3
++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
++1:      subs    r12, #1
++        // load b
++        vld1.8  {q8-q9}, [r1, :128], r3
++        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
++        vadd.s8 q0, q4, q12 //diff0 + diff1
++        vadd.s8 q1, q5, q13
++        table32
++        // CMP ( c, a ) for the next row is the negation of this row's CMP ( c, b )
++        vneg.s8 q12, q4
++        vneg.s8 q13, q5
++        // c for the next row is this row's b
++        vmov.64 q2, q8
++        vmov.64 q3, q9
++        bne     1b
++        vpop {q4-q7}
++        bx      lr
++endfunc
++
++function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 // a = up-left neighbour, b = down-right neighbour; rows are carried over between iterations
++        init_edge_32
++        vpush   {d8-d15}
++        // load a
++        sub     r1, r3 // row above
++        sub     r1, #8 // 8 bytes before the row so the loads stay 64-bit aligned
++        vld1.8  {q10-q11}, [r1, :64]!
++        vld1.8  {d24}, [r1, :64], r3
++        sub     r1, #32 // r1 = current row - 8
++        vext.8  q0, q10, q11, #7 // bytes from offset 7, i.e. starting one byte before the row above
++        vext.8  q1, q11, q12, #7
++        // load c
++        vld1.8  {d9}, [r1, :64]! // 8 bytes preceding the current row (upper half of q4)
++        vld1.8  {q2-q3}, [r1, :64], r3
++        sub     r1, #8 // r1 = next row - 8
++        vext.8  q4, q4, q2, #15 // c shifted by one byte: first half of a for the next iteration
++1:      subs    r12, #1
++        // load b
++        vld1.8  {q10-q11}, [r1, :64]!
++        vld1.8  {q12}, [r1, :64], r3
++        sub     r1, #32 // r1 = following row - 8
++        vext.8  q8, q10, q11, #9 // bytes from offset 9: row below shifted one byte right (b)
++        vext.8  q9, q11, q12, #9
++        vext.8  q6, q10, q11, #8 // bytes from offset 8: row below itself (next c)
++        vext.8  q7, q11, q12, #8
++        vext.8  q5, q10, q11, #7 // bytes from offset 7: row below shifted one byte left (later a)
++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // c vs a
++        diff32 q0, q1, q10, q11, q8, q9, q2, q3 // c vs b
++        vadd.s8 q0, q12 //diff0 + diff1
++        vadd.s8 q1, q13
++        table32
++        // inputs for next loop iteration
++        // a
++        vmov.8  q0, q4
++        vext.8  q1, q2, q3, #15
++        // c
++        vmov.8  q2, q6
++        vmov.8  q3, q7
++        vmov.8  q4, q5
++        bne     1b
++        vpop    {d8-d15}
++        bx      lr
++endfunc
++
++function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 // a = up-right neighbour, b = down-left neighbour; uses no q4-q7, hence no vpush
++        init_edge_32
++        sub     r1, r3 // row above
++        // load a
++        vld1.8  {q10-q11}, [r1, :64]!
++        vld1.8  {d24}, [r1, :64], r3
++        sub     r1, #32 // r1 = current row
++        vext.8  q0, q10, q11, #1 // row above shifted one byte right
++        vext.8  q1, q11, q12, #1
++        // load c
++        vld1.8  {q2-q3}, [r1, :64]!
++        vld1.8  {d30}, [r1, :64], r3 // 8 bytes following the current row
++        sub     r1, #40 // r1 = next row - 8
++1:      subs    r12, #1
++        // load b
++        vld1.8  {q10-q11}, [r1, :64]!
++        vld1.8  {q12}, [r1, :64], r3
++        sub     r1, #32 // r1 = following row - 8
++        vext.8  q8, q10, q11, #7 // bytes from offset 7: row below shifted one byte left (b)
++        vext.8  q9, q11, q12, #7
++        vext.8  q14, q12, q10, #7 // bytes of the row below from offset 31 onwards, kept for the next c
++
++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // c vs a
++        diff32 q0, q1, q10, q11, q8, q9, q2, q3 // c vs b
++
++        vadd.s8 q0, q12 //diff0 + diff1
++        vadd.s8 q1, q13
++        table32
++
++        // inputs for next loop iteration
++        // a
++        vext.8  q0, q2, q3, #1 // this row's c shifted one byte right
++        vext.8  q1, q3, q15, #1
++        // c
++        vext.8  q2, q8, q9, #1 // this row's b shifted one byte right, i.e. the row below itself
++        vext.8  q3, q9, q14, #1
++        vext.8  d30, d28, d2, #1 // bytes following the new c row
++        bne     1b
++        bx      lr
++endfunc
++
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index 39713ed..25eb52b 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -410,6 +410,8 @@ enum AVCodecID {
+     AV_CODEC_ID_SHEERVIDEO,
+     AV_CODEC_ID_YLC,
+ 
++    AV_CODEC_ID_H264_MVC,
++
+     /* various PCM "codecs" */
+     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
+     AV_CODEC_ID_PCM_S16LE = 0x10000,
+@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
+ #define FF_BUG_DC_CLIP          4096
+ #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
+ #define FF_BUG_TRUNCATED       16384
++#define FF_BUG_GMC_UNSUPPORTED 32768
+ 
+     /**
+      * strictly follow the standard (MPEG-4, ...).
+@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
+ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
+ #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
+ #define FF_PROFILE_H264_CAVLC_444            44
++#define FF_PROFILE_H264_MULTIVIEW_HIGH       118
++#define FF_PROFILE_H264_STEREO_HIGH          128
++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
+ 
+ #define FF_PROFILE_VC1_SIMPLE   0
+ #define FF_PROFILE_VC1_MAIN     1
+@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
+ #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+ #endif
+ 
++    /**
++     * Opaque pointer for use by replacement get_buffer2 code
++     *
++     * @author jc (08/02/2016)
++     */
++    void * get_buffer_context;
+ } AVCodecContext;
+ 
+ AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
+index 1bf1c62..ccfa991 100644
+--- a/libavcodec/cabac.h
++++ b/libavcodec/cabac.h
+@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
+ typedef struct CABACContext{
+     int low;
+     int range;
+-    int outstanding_count;
++    union
++    {
++        int outstanding_count;
++        struct {
++            uint16_t bits;
++            uint16_t range;
++        } by22;
++    };
+     const uint8_t *bytestream_start;
+     const uint8_t *bytestream;
+     const uint8_t *bytestream_end;
+diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
+index 9d94b72..535ebf0 100644
+--- a/libavcodec/codec_desc.c
++++ b/libavcodec/codec_desc.c
+@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
+         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
+         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+     },
++    {
++        .id        = AV_CODEC_ID_H264_MVC,
++        .type      = AVMEDIA_TYPE_VIDEO,
++        .name      = "h264_mvc",
++        .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
++        .props     = AV_CODEC_PROP_LOSSY,
++    },
+ 
+     /* various PCM "codecs" */
+     {
+diff --git a/libavcodec/h264.h b/libavcodec/h264.h
+index efe3555..16358aa 100644
+--- a/libavcodec/h264.h
++++ b/libavcodec/h264.h
+@@ -126,7 +126,9 @@ enum {
+     NAL_END_STREAM      = 11,
+     NAL_FILLER_DATA     = 12,
+     NAL_SPS_EXT         = 13,
++    NAL_SPS_SUBSET      = 15,
+     NAL_AUXILIARY_SLICE = 19,
++    NAL_SLICE_EXT       = 20,
+     NAL_FF_IGNORE       = 0xff0f001,
+ };
+ 
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index ce4bab2..b9b0c78 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
+     uint8_t parse_history[6];
+     int parse_history_count;
+     int parse_last_mb;
++    int is_mvc;
++    int slice_ext;
+ } H264ParseContext;
+ 
+ 
+@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
+         } else if (state <= 5) {
+             int nalu_type = buf[i] & 0x1F;
+             if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
+-                nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
++                nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
++                nalu_type == NAL_SPS_SUBSET) {
+                 if (pc->frame_start_found) {
+                     i++;
+                     goto found;
+                 }
+             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
+-                       nalu_type == NAL_IDR_SLICE) {
++                       nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
+                 state += 8;
++
++                p->slice_ext = (nalu_type == NAL_SLICE_EXT);
+                 continue;
+             }
+             state = 7;
+         } else {
+             p->parse_history[p->parse_history_count++] = buf[i];
+-            if (p->parse_history_count > 5) {
++            if (p->parse_history_count > 8) {
+                 unsigned int mb, last_mb = p->parse_last_mb;
+                 GetBitContext gb;
+ 
+-                init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
++                init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
+                 p->parse_history_count = 0;
+                 mb= get_ue_golomb_long(&gb);
+                 p->parse_last_mb = mb;
+@@ -145,7 +150,7 @@ found:
+     pc->frame_start_found = 0;
+     if (p->is_avc)
+         return next_avc;
+-    return i - (state & 5) - 5 * (state > 7);
++    return i - (state & 5) - 8 * (state > 7);
+ }
+ 
+ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
+@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
+         }
+     }
+ 
+-    parse_nal_units(s, avctx, buf, buf_size);
++    if (!p->is_mvc)
++        parse_nal_units(s, avctx, buf, buf_size);
+ 
+     if (avctx->framerate.num)
+         avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
+         if ((state & 0xFFFFFF00) != 0x100)
+             break;
+         nalu_type = state & 0x1F;
+-        if (nalu_type == NAL_SPS) {
++        if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
+             has_sps = 1;
+         } else if (nalu_type == NAL_PPS)
+             has_pps = 1;
+@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
+     .parser_close   = h264_close,
+     .split          = h264_split,
+ };
++
++static av_cold int init_mvc(AVCodecParserContext *s)
++{
++    H264ParseContext *const ctx = s->priv_data;
++    const int err = init(s); /* common H.264 parser initialisation */
++
++    if (err < 0)
++        return err;          /* hand the failure code back unchanged */
++    ctx->is_mvc = 1;         /* tested by h264_find_frame_end() and h264_parse() */
++    return 0;
++}
++
++AVCodecParser ff_h264_mvc_parser = { /* parser registered for the H.264 MVC codec id */
++    .codec_ids      = { AV_CODEC_ID_H264_MVC },
++    .priv_data_size = sizeof(H264ParseContext),
++    .parser_init    = init_mvc,   /* only MVC-specific callback: sets is_mvc after init() */
++    .parser_parse   = h264_parse, /* the remaining callbacks are the plain H.264 ones */
++    .parser_close   = h264_close,
++    .split          = h264_split,
++};
 diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index b478065..aa45dd6 100644
+index b478065..88dd40b 100644
 --- a/libavcodec/hevc.c
 +++ b/libavcodec/hevc.c
-@@ -931,6 +931,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+@@ -41,8 +41,186 @@
+ #include "hevc.h"
+ #include "profiles.h"
+ 
++#ifdef RPI
++  #include "rpi_qpu.h"
++  #include "rpi_user_vcsm.h"
++  // Move Inter prediction into separate pass
++  #define RPI_INTER
++
++  #ifdef RPI_INTER_QPU
++    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
++    #define RPI_MULTI_MAILBOX
++  #endif
++
++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
++  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
++
++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
++  //#define RPI_SIMULATE_QPUS
++  #ifdef RPI_WORKER
++    #include "pthread.h"
++  #endif
++
++  static void rpi_execute_dblk_cmds(HEVCContext *s);
++  static void rpi_execute_transform(HEVCContext *s);
++  static void rpi_launch_vpu_qpu(HEVCContext *s);
++  static void rpi_execute_pred_cmds(HEVCContext *s);
++  static void rpi_execute_inter_cmds(HEVCContext *s);
++  static void rpi_begin(HEVCContext *s);
++  static void flush_frame(HEVCContext *s,AVFrame *frame);
++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
++
++#endif
++
++// #define DISABLE_MC
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2 /* fallback, compiled only when the macro is not already defined */
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{   /* Returns the p least significant bits of a; p must be below the bit width of unsigned. */
++    return a & ((1U << p) - 1); /* unsigned shift: 1 << 31 on a signed int is undefined behaviour */
++}
++#   define av_mod_uintp2   av_mod_uintp2_c
++#endif
++
+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+ 
++
++#ifdef RPI_INTER_QPU
++
++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
++// For each block of 64*64 the smallest block size is 8x4
++// We also need an extra command for the setup information
++
++#define RPI_CHROMA_COMMAND_WORDS 12
++#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++#define RPI_LUMA_COMMAND_WORDS 10
++#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++// TODO Chroma only needs 4 taps
++
++// Tap magnitudes, four per entry, packed one byte each by ENCODE_COEFFS (c0 in the low byte). Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8][1] = { // 8 entries, presumably one per fractional sample position - TODO confirm
++        { ENCODE_COEFFS(   0,  64,   0,   0) },
++        { ENCODE_COEFFS(  2,  58,  10,  2) },
++        { ENCODE_COEFFS(  4,  54,  16,  2) },
++        { ENCODE_COEFFS(  6,  46,  28,  4) },
++        { ENCODE_COEFFS(  4,  36,  36,  4) },
++        { ENCODE_COEFFS(  4,  28,  46,  6) },
++        { ENCODE_COEFFS(  2,  16,  54,  4) },
++        { ENCODE_COEFFS(  2,  10,  58,  2) }
++};
++
++#endif
++
++
++#ifdef RPI_WORKER
++
++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
++
++#define LOG_ENTER
++#define LOG_EXIT
++
++// Call this when we have completed pass0 and wish to trigger pass1 for the current job.
++static void worker_submit_job(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  s->worker_tail++; // One more job submitted; worker_start() proceeds once tail - head > 0
++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call this to say we have completed pass1 for the job in slot s->pass1_job.
++static void worker_complete_job(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  s->worker_head++; // One more job completed; worker_wait() and worker_pass0_ready() compare head against tail
++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call this to wait for all jobs to have completed at the end of a frame.
++static void worker_wait(HEVCContext *s)
++{
++  LOG_ENTER
++  pthread_mutex_lock(&s->worker_mutex);
++  while( s->worker_head !=s->worker_tail) // head == tail: every submitted job has been completed
++  {
++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); // signalled by worker_complete_job(); condition is re-checked on wake-up
++  }
++  pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job (blocks while all RPI_MAX_JOBS slots are in use).
++static void worker_pass0_ready(HEVCContext *s)
++{
++  LOG_ENTER
++    pthread_mutex_lock(&s->worker_mutex);
++    // tail is the number of submitted jobs
++    // head is the number of completed jobs
++    // tail-head is the number of outstanding jobs in the queue
++    // we need to ensure there is at least 1 space left for us to use
++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
++    {
++      // Wait until another job is completed (worker_complete_job broadcasts worker_cond_head)
++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
++    }
++    pthread_mutex_unlock(&s->worker_mutex);
++  LOG_EXIT
++}
++
++static void *worker_start(void *arg)
++{ // Worker loop: arg is the HEVCContext. Presumably run as a pthread start routine - thread creation is not visible here.
++  HEVCContext *s = (HEVCContext *)arg;
++  while(1) {
++    pthread_mutex_lock(&s->worker_mutex);
++
++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) // sleep until a job is queued or shutdown is requested
++    {
++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
++    }
++    pthread_mutex_unlock(&s->worker_mutex);
++
++    if (s->kill_worker) { // NOTE(review): kill_worker is re-read here after the mutex has been released
++      break;
++    }
++    LOG_ENTER
++    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
++    rpi_launch_vpu_qpu(s);
++    // Perform inter prediction
++    rpi_execute_inter_cmds(s);
++    // Wait for transform completion
++    vpu_wait(s->vpu_id);
++
++    // Perform intra prediction and residual reconstruction
++    rpi_execute_pred_cmds(s);
++    // Perform deblocking for CTBs in this row
++    rpi_execute_dblk_cmds(s);
++
++    worker_complete_job(s); // advances head / pass1_job and wakes anyone waiting on worker_cond_head
++    LOG_EXIT
++  }
++  return NULL;
++}
++
++#endif
++
+ /**
+  * NOTE: Each function hls_foo correspond to the function foo in the
+  * specification (HLS stands for High Level Syntax).
+@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+ /* free everything allocated  by pic_arrays_init() */
+ static void pic_arrays_free(HEVCContext *s)
+ {
++#ifdef RPI
++    int job;
++    for(job=0;job<RPI_MAX_JOBS;job++) { // release the per-job coefficient buffers set up in pic_arrays_init()
++      if (s->coeffs_buf_arm[job][0]) { // [0] mirrors coeffs_buf_default[job].arm, so non-zero means allocated
++        gpu_free(&s->coeffs_buf_default[job]);
++        s->coeffs_buf_arm[job][0] = 0;
++      }
++      if (s->coeffs_buf_arm[job][2]) { // [2] mirrors coeffs_buf_accelerated[job].arm
++        gpu_free(&s->coeffs_buf_accelerated[job]);
++        s->coeffs_buf_arm[job][2] = 0;
++      }
++    }
++#endif
++#ifdef RPI_DEBLOCK_VPU
++    {
++        int i;
++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++
++            if (dvq->vpu_cmds_arm) { // vpu_cmds_arm points at the start of deblock_vpu_gmem, so non-zero means allocated
++                gpu_free(&dvq->deblock_vpu_gmem);
++              dvq->vpu_cmds_arm = 0;
++            }
++        }
++    }
++#endif
+     av_freep(&s->sao);
+     av_freep(&s->deblock);
+ 
+@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+     int ctb_count        = sps->ctb_width * sps->ctb_height;
+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+ 
++#ifdef RPI
++    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
++    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
++    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; // two chroma planes, scaled down by the chroma subsampling shifts
++    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
++    int job;
++
++    av_assert0(sps); // NOTE(review): sps has already been dereferenced by the initialisers above, so this check comes too late to help
++    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
++    s->ctu_per_y_chan = s->max_ctu_count / 12;
++    s->ctu_per_uv_chan = s->max_ctu_count / 8;
++    for(job=0;job<RPI_MAX_JOBS;job++) { // NOTE(review): the nested loop below reuses 'job', so this outer body runs only once
++      printf("Allocated %d\n",coefs_per_row); // NOTE(review): debug output written straight to stdout
++      for(job=0;job<RPI_MAX_JOBS;job++) {
++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
++        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
++        if (!s->coeffs_buf_arm[job][0])
++            goto fail;
++        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra block's worth of data
++        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
++        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
++        if (!s->coeffs_buf_arm[job][2])
++            goto fail;
++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
++        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; // same end-of-buffer position, as a byte offset on the vc address
++      }
++    }
++#endif
++#ifdef RPI_DEBLOCK_VPU
++    {
++        int i;
++        s->enable_rpi_deblock = !sps->sao_enabled;
++        s->setup_width = (sps->width+15) / 16; // luma dimensions rounded up to whole units of 16
++        s->setup_height = (sps->height+15) / 16;
++        s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; // same rounding on the chroma-subsampled dimensions
++        s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
++
++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
++        {
++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++            const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; // each region is padded to a multiple of 16 bytes
++            const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
++            const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
++            const unsigned int total_size = cmd_size + y_size + uv_size; // must cover all three regions carved out below
++            int p_vc;
++            uint8_t * p_arm;
++ #if RPI_VPU_DEBLOCK_CACHED
++            gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
++ #else
++            gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
++ #endif
++            p_vc = dvq->deblock_vpu_gmem.vc;
++            p_arm = dvq->deblock_vpu_gmem.arm;
++
++            // Zap all
++            memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
++
++            // Subdivide the single allocation: [vpu cmds | luma setup | chroma setup]
++            dvq->vpu_cmds_arm = (void*)p_arm;
++            dvq->vpu_cmds_vc = p_vc;
++
++            p_arm += cmd_size;
++            p_vc += cmd_size;
++
++            dvq->y_setup_arm = (void*)p_arm;
++            dvq->y_setup_vc = (void*)p_vc;
++
++            p_arm += y_size;
++            p_vc += y_size;
++
++            dvq->uv_setup_arm = (void*)p_arm;
++            dvq->uv_setup_vc = (void*)p_vc;
++
++            dvq->cmd_id = -1;
++        }
++
++        s->dvq_n = 0;
++        s->dvq = s->dvq_ents + s->dvq_n; // start with the first queue entry
++    }
++#endif
++
+     s->bs_width  = (width  >> 2) + 1;
+     s->bs_height = (height >> 2) + 1;
+ 
+@@ -137,6 +422,29 @@ fail:
+     return AVERROR(ENOMEM);
+ }
+ 
++static void default_pred_weight_table(HEVCContext * const s)
++{
++  unsigned int ref, c; // reference index and chroma component (0, 1)
++  s->sh.luma_log2_weight_denom = 0;   // zero log2 denominators
++  s->sh.chroma_log2_weight_denom = 0;
++  for (ref = 0; ref < s->sh.nb_refs[L0]; ref++) { // list 0: unit weights, zero offsets
++      s->sh.luma_weight_l0[ref] = 1;
++      s->sh.luma_offset_l0[ref] = 0;
++      for (c = 0; c < 2; c++) {
++          s->sh.chroma_weight_l0[ref][c] = 1;
++          s->sh.chroma_offset_l0[ref][c] = 0;
++      }
++  }
++  for (ref = 0; ref < s->sh.nb_refs[L1]; ref++) { // list 1: same defaults
++      s->sh.luma_weight_l1[ref] = 1;
++      s->sh.luma_offset_l1[ref] = 0;
++      for (c = 0; c < 2; c++) {
++          s->sh.chroma_weight_l1[ref][c] = 1;
++          s->sh.chroma_offset_l1[ref][c] = 0;
++      }
++  }
++}
++
+ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+ {
+     int i = 0;
+@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
+                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
+                 pred_weight_table(s, gb);
+             }
++            else
++            {
++              // Give us unit weights
++              default_pred_weight_table(s);
++            }
+ 
+             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
+             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
+@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
      return 0;
  }
  
@@ -233,7 +2801,7 @@ index b478065..aa45dd6 100644
 +{
 +    if (s->enable_rpi) {
 +        HEVCLocalContext *lc = s->HEVClc;
-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
 +        cmd->type = RPI_PRED_INTRA;
 +        cmd->size = log2_trafo_size;
 +        cmd->c_idx = c_idx;
@@ -250,7 +2818,7 @@ index b478065..aa45dd6 100644
  static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                                int xBase, int yBase, int cb_xBase, int cb_yBase,
                                int log2_cb_size, int log2_trafo_size,
-@@ -943,8 +962,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
      if (lc->cu.pred_mode == MODE_INTRA) {
          int trafo_size = 1 << log2_trafo_size;
          ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
@@ -263,7 +2831,7 @@ index b478065..aa45dd6 100644
      }
  
      if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
-@@ -1030,7 +1052,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
@@ -275,7 +2843,7 @@ index b478065..aa45dd6 100644
                  }
                  if (cbf_cb[i])
                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1059,7 +1085,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
@@ -287,7 +2855,7 @@ index b478065..aa45dd6 100644
                  }
                  if (cbf_cr[i])
                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-@@ -1088,7 +1118,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
                                                      trafo_size_h, trafo_size_v);
@@ -299,7 +2867,7 @@ index b478065..aa45dd6 100644
                  }
                  if (cbf_cb[i])
                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1098,7 +1132,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
                  if (lc->cu.pred_mode == MODE_INTRA) {
                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
                                                  trafo_size_h, trafo_size_v);
@@ -311,7 +2879,7 @@ index b478065..aa45dd6 100644
                  }
                  if (cbf_cr[i])
                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-@@ -1110,26 +1148,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
              int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
              int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
              ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
@@ -358,6820 +2926,17 @@ index b478065..aa45dd6 100644
              }
          }
      }
-@@ -2304,6 +2362,31 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
- }
- 
-+#ifdef RPI
-+static void rpi_execute_pred_cmds(HEVCContext *s)
-+{
-+  int i;
-+  HEVCPredCmd *cmd = s->univ_pred_cmds;
-+  HEVCLocalContext *lc = s->HEVClc;
-+
-+  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
-+      if (cmd->type == RPI_PRED_INTRA) {
-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+      } else {
-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+      }
-+  }
-+  s->num_pred_cmds = 0;
-+  s->num_coeffs = 0;
-+}
-+#endif
-+
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- {
-     HEVCContext *s  = avctxt->priv_data;
-@@ -2313,6 +2396,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int y_ctb       = 0;
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
-+#ifdef RPI
-+    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+#endif
-+
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-         return AVERROR_INVALIDDATA;
-@@ -2342,6 +2429,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+#ifdef RPI
-+        rpi_execute_pred_cmds(s);
-+#endif
-         if (more_data < 0) {
-             s->tab_slice_address[ctb_addr_rs] = -1;
-             return more_data;
-@@ -2387,6 +2477,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-     s = s1->sList[self_id];
-     lc = s->HEVClc;
- 
-+#ifdef RPI
-+    s->enable_rpi = 0;
-+#endif
-+
-     if(ctb_row) {
-         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
- 
-@@ -3075,6 +3169,13 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
- 
-     av_freep(&s->cabac_state);
- 
-+#ifdef RPI
-+    av_freep(&s->unif_mv_cmds);
-+    av_freep(&s->unif_xfm_cmds);
-+    av_freep(&s->univ_pred_cmds);
-+    av_freep(&s->coeffs_buf);
-+#endif
-+
-     for (i = 0; i < 3; i++) {
-         av_freep(&s->sao_pixel_buffer_h[i]);
-         av_freep(&s->sao_pixel_buffer_v[i]);
-@@ -3129,6 +3230,22 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->HEVClcList[0] = s->HEVClc;
-     s->sList[0] = s;
- 
-+#ifdef RPI
-+    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+    if (!s->unif_mv_cmds)
-+        goto fail;
-+    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
-+    if (!s->unif_xfm_cmds)
-+        goto fail;
-+    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+    if (!s->univ_pred_cmds)
-+        goto fail;
-+    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
-+    if (!s->coeffs_buf)
-+        goto fail;
-+    s->enable_rpi = 0;
-+#endif
-+
-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-     if (!s->cabac_state)
-         goto fail;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index be91010..7a1c35f 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -23,6 +23,9 @@
- #ifndef AVCODEC_HEVC_H
- #define AVCODEC_HEVC_H
- 
-+// define RPI to split the CABAC/prediction/transform into separate stages
-+#include "config.h"
-+
- #include "libavutil/buffer.h"
- #include "libavutil/md5.h"
- 
-@@ -790,6 +793,49 @@ typedef struct HEVCLocalContext {
-     int boundary_flags;
- } HEVCLocalContext;
- 
-+#ifdef RPI
-+
-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+#define RPI_MAX_WIDTH 2048
-+
-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
-+#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-+#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+// Each block can have an intra prediction and a transform_add command
-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+
-+// Command for inter prediction
-+typedef struct HEVCMvCmd {
-+} HEVCMvCmd;
-+
-+// Command for transform to process a block of coefficients
-+typedef struct HEVCXfmCmd {
-+} HEVCXfmCmd;
-+
-+// Command for intra prediction and transform_add of predictions to coefficients
-+#define RPI_PRED_TRANSFORM_ADD 0
-+#define RPI_PRED_INTRA 1
-+typedef struct HEVCPredCmd {
-+    uint8_t size;
-+    uint8_t type;
-+    uint8_t na;
-+    uint8_t c_idx;
-+    union {
-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t x;   // RPI_PRED_INTRA
-+    };
-+    union {
-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t y;   // RPI_PRED_INTRA
-+    };
-+    union {
-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
-+        uint32_t stride;         // RPI_PRED_INTRA
-+    };
-+} HEVCPredCmd;
-+
-+#endif
-+
- typedef struct HEVCContext {
-     const AVClass *c;  // needed by private avoptions
-     AVCodecContext *avctx;
-@@ -805,6 +851,18 @@ typedef struct HEVCContext {
-     int                 width;
-     int                 height;
- 
-+#ifdef RPI
-+    int enable_rpi;
-+    HEVCMvCmd *unif_mv_cmds;
-+    HEVCXfmCmd *unif_xfm_cmds;
-+    HEVCPredCmd *univ_pred_cmds;
-+    int16_t *coeffs_buf;
-+    int num_mv_cmds;
-+    int num_xfm_cmds;
-+    int num_pred_cmds;
-+    int num_coeffs;
-+#endif
-+
-     uint8_t *cabac_state;
- 
-     /** 1 if the independent slice segment header was successfully parsed */
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 05b2821..4e97f06 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1510,6 +1510,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-         }
-     }
-+#ifdef RPI
-+    if (s->enable_rpi) {
-+        int16_t *c = s->coeffs_buf + s->num_coeffs;
-+        int n = trafo_size * trafo_size;
-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
-+        s->num_coeffs += n;
-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
-+        cmd->size = log2_trafo_size;
-+        cmd->buf = c;
-+        cmd->dst = dst;
-+        cmd->stride = stride;
-+        return;
-+    }
-+#endif
-     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
- }
- 
--- 
-2.7.4
-
-
-From f8293de11dc040d9fa2a558762a357c0c353d2c9 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 30 Apr 2015 15:23:22 +0100
-Subject: [PATCH 03/68] Added simple VPU test code
-
----
- libavcodec/Makefile             |    7 +
- libavcodec/hevc.c               |   33 +-
- libavcodec/rpi_hevc_transform.h |  212 ++++++
- libavcodec/rpi_hevc_transform.s |  147 ++++
- libavcodec/rpi_mailbox.c        |  293 ++++++++
- libavcodec/rpi_mailbox.h        |   20 +
- libavcodec/rpi_qpu.c            |  652 ++++++++++++++++++
- libavcodec/rpi_qpu.h            |   45 ++
- libavcodec/rpi_shader.c         |  818 ++++++++++++++++++++++
- libavcodec/rpi_shader.h         |   20 +
- libavcodec/rpi_shader.qasm      | 1413 +++++++++++++++++++++++++++++++++++++++
- libavcodec/rpi_user_vcsm.h      |  425 ++++++++++++
- 12 files changed, 4084 insertions(+), 1 deletion(-)
- create mode 100644 libavcodec/rpi_hevc_transform.h
- create mode 100644 libavcodec/rpi_hevc_transform.s
- create mode 100644 libavcodec/rpi_mailbox.c
- create mode 100644 libavcodec/rpi_mailbox.h
- create mode 100644 libavcodec/rpi_qpu.c
- create mode 100644 libavcodec/rpi_qpu.h
- create mode 100644 libavcodec/rpi_shader.c
- create mode 100644 libavcodec/rpi_shader.h
- create mode 100644 libavcodec/rpi_shader.qasm
- create mode 100644 libavcodec/rpi_user_vcsm.h
-
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index fd0d1f0..03065cd 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -5,6 +5,10 @@ NAME = avcodec
- HEADERS = avcodec.h                                                     \
-           avdct.h                                                       \
-           avfft.h                                                       \
-+          rpi_qpu.h                                                     \
-+          rpi_shader.h                                                  \
-+          rpi_mailbox.h                                                 \
-+          rpi_hevc_transform.h                                          \
-           d3d11va.h                                                     \
-           dirac.h                                                       \
-           dv_profile.h                                                  \
-@@ -43,6 +47,9 @@ OBJS = allcodecs.o                                                      \
-        resample.o                                                       \
-        resample2.o                                                      \
-        utils.o                                                          \
-+       rpi_qpu.o                                                        \
-+       rpi_shader.o                                                     \
-+       rpi_mailbox.o                                                    \
-        vorbis_parser.o                                                  \
-        xiph.o                                                           \
- 
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index aa45dd6..ab55df1 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -41,6 +41,10 @@
- #include "hevc.h"
- #include "profiles.h"
- 
-+#ifdef RPI
-+#include "rpi_qpu.h"
-+#endif
-+
- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
- 
- /**
-@@ -2430,7 +2434,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- #ifdef RPI
--        rpi_execute_pred_cmds(s);
-+        if (x_ctb + ctb_size >= s->ps.sps->width) {
-+            rpi_execute_pred_cmds(s);
-+        }
- #endif
-         if (more_data < 0) {
-             s->tab_slice_address[ctb_addr_rs] = -1;
-@@ -3244,6 +3250,31 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     if (!s->coeffs_buf)
-         goto fail;
-     s->enable_rpi = 0;
-+
-+    // A little test program
-+    {
-+      GPU_MEM_PTR_T p;
-+      int err = gpu_malloc_cached(16, &p);
-+      short *q = (short *)p.arm;
-+      int i;
-+      int r;
-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
-+      printf("Preparing data %p\n",q);
-+      for(i=0;i<16;i++)
-+        q[i] = i;
-+      printf("Flush cache\n");
-+      gpu_cache_flush(&p);
-+      printf("Executing code\n");
-+      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
-+      printf("Return value %d (",r);
-+      for(i=0;i<16;i++)
-+        printf("%d ",q[i]);
-+      printf(")\n");
-+      gpu_free(&p);
-+      goto fail; // Early out
-+    }
-+
- #endif
- 
-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-new file mode 100644
-index 0000000..85a9102
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.h
-@@ -0,0 +1,212 @@
-+unsigned char rpi_hevc_transform [] = {
-+169,
-+3,
-+3,
-+232,
-+128,
-+0,
-+0,
-+0,
-+20,
-+248,
-+0,
-+136,
-+0,
-+0,
-+192,
-+248,
-+0,
-+0,
-+0,
-+96,
-+3,
-+232,
-+32,
-+0,
-+0,
-+0,
-+7,
-+232,
-+0,
-+2,
-+0,
-+0,
-+8,
-+232,
-+0,
-+4,
-+0,
-+0,
-+12,
-+248,
-+0,
-+128,
-+0,
-+0,
-+192,
-+8,
-+4,
-+0,
-+4,
-+232,
-+64,
-+0,
-+0,
-+0,
-+5,
-+232,
-+0,
-+0,
-+8,
-+0,
-+128,
-+69,
-+113,
-+66,
-+12,
-+248,
-+0,
-+128,
-+0,
-+0,
-+192,
-+8,
-+4,
-+0,
-+128,
-+69,
-+113,
-+70,
-+128,
-+144,
-+39,
-+0,
-+4,
-+255,
-+48,
-+192,
-+128,
-+3,
-+32,
-+8,
-+16,
-+0,
-+76,
-+254,
-+48,
-+192,
-+9,
-+4,
-+32,
-+8,
-+0,
-+0,
-+4,
-+254,
-+0,
-+144,
-+128,
-+2,
-+0,
-+248,
-+62,
-+0,
-+128,
-+144,
-+22,
-+0,
-+4,
-+255,
-+48,
-+192,
-+128,
-+3,
-+32,
-+8,
-+16,
-+0,
-+76,
-+254,
-+48,
-+192,
-+9,
-+4,
-+32,
-+8,
-+0,
-+0,
-+140,
-+248,
-+44,
-+0,
-+0,
-+0,
-+32,
-+48,
-+4,
-+0,
-+128,
-+69,
-+113,
-+66,
-+242,
-+140,
-+211,
-+192,
-+41,
-+3,
-+68,
-+192,
-+80,
-+7,
-+164,
-+255,
-+36,
-+220,
-+96,
-+2,
-+0,
-+248,
-+62,
-+0,
-+3,
-+255,
-+55,
-+208,
-+120,
-+3,
-+224,
-+3,
-+190,
-+11,
-+16,
-+139,
-+246,
-+83,
-+0,
-+103,
-+90,
-+0,
-+8,
-+240,
-+0,
-+128,
-+128,
-+3,
-+0,
-+247,
-+32,
-+128,
-+10,
-+4,
-+136,
-+240,
-+32,
-+0,
-+128,
-+3,
-+112,
-+96,
-+90,
-+0,
-+};
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-new file mode 100644
-index 0000000..5e2728d
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -0,0 +1,147 @@
-+# ******************************************************************************
-+# Argon Design Ltd.
-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-+#
-+# Module : HEVC
-+# Author : Peter de Rivaz
-+# ******************************************************************************
-+
-+# HEVC VPU Transform
-+#
-+# Transform matrix can be thought of as
-+#   output row vector = input row vector * transMatrix2
-+#
-+# The even rows of the matrix are symmetric
-+# The odd rows of the matrix are antisymmetric
-+#
-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-+#
-+# EXAMPLE
-+#   (a b c d) (1 2  2  1)
-+#             (3 4 -4 -3)
-+#             (5 6  6  5)
-+#             (7 8 -8 -7)
-+#
-+#  x=(a c)(1 2) = 1a+5c 2a+6c
-+#         (5 6)
-+#
-+#  y=(b d)(3 4) = 3b+7d 4b+8d
-+#         (7 8)
-+#
-+#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-+#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-+#
-+#  Final results are (u , v[::-1])
-+#
-+#
-+#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-+#  Apply the even matrix first and stop before rounding
-+#  Then apply the odd matrix in a full manner:
-+#
-+#   First step is to compute partial products with the first input (16 cycles)
-+#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
-+#   2a 4b 6c 8d
-+#   2a -4b 6c -8d
-+#   1a -3b 5c -7d
-+#
-+#   Second step is to sum partial products into final position (8 cycles)
-+#   1a+3b+5c+7d
-+#   2a+4b+6c+8d
-+#   2a-4b+6c-8d
-+#   1a-3b+5c-7d
-+#
-+#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-+#
-+#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-+#
-+#   For 8x8 we could compute two in parallel.
-+#
-+#
-+
-+test_add:
-+  vldh HX(0,0),(r0)
-+  vadd HX(0,0),HX(0,0),10
-+  vsth HX(0,0),(r0)
-+  mov r0,7 # return value
-+  b lr
-+
-+# Columns are transformed first
-+#
-+# Store top left half of transMatrix2 in
-+# Store bottom left half of transMatrix2 in HX(32,32)
-+#
-+# For 16x16
-+# HX(0:15,0) contains input data before transform
-+# HY(0:15,0) contains 32bit output data after transform
-+# HX(32,0) contains even rows of left half of transMatrix2
-+# HX(32,32) contains odd rows of left half of transMatrix2
-+# HY(48,0) contains partial products ready for summing
-+#
-+
-+
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done
-+#
-+hevc_trans_16x16:
-+  push r6-r15, lr # TODO cut down number of used registers
-+
-+  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
-+  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+  # Now use r0 to describe which matrix we are working on.
-+  # Allows us to prefetch the next block of coefficients for efficiency.
-+  mov r0,0 # This describes the location where we read our coefficients from
-+  mov r3,16*2 # Stride of coefficients in bytes
-+  mov r7,16*16*2 # Total block size
-+  mov r8,64*16 # Value used to swap from current to next VRF location
-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+  mov r4,64 # Constant used for rounding first pass
-+  mov r5,1<<19 # Constant used for rounding second pass
-+
-+  # At start of block r0,r1 point to the current block (that has already been loaded)
-+block_loop:
-+  eor r0,r8
-+  add r1,r7
-+  # Prefetch the next block
-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+  eor r0,r8
-+  sub r1,r7
-+
-+  # Transform the current block
-+  bl col_trans_16
-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-+
-+  bl col_trans_16
-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+
-+  # Save results - note there has been a transposition during the processing so we save columns
-+  vsth VX(0,32++)+r0, (r1 += r3) REP 16
-+
-+  # Move onto next block
-+  eor r0,r8
-+  add r1,r7
-+
-+  addcmpbgt r2,-1,0,block_loop
-+  pop r6-r15, pc
-+
-+# r1,r2,r3 r7,r8 should be preserved
-+# HX(0++,0)+r0 is the block to be transformed
-+# HX(32++,0) is the 16x16 matrix of transform coefficients
-+# Use HY(48,0) for intermediate results
-+# r0 can be used, but should be returned to its original value at the end
-+col_trans_16:
-+  add r4,r0,16 # Final value for this loop
-+col_trans_16_loop:
-+  # First compute partial products for a single column
-+  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-+  # Then sum up the results and place back
-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+  addcmpblt r0,1,r4,col_trans_16_loop
-+  sub r0,16  # but r0 back to its original value
-+  b lr
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-new file mode 100644
-index 0000000..536896f
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.c
-@@ -0,0 +1,293 @@
-+/*
-+Copyright (c) 2012, Broadcom Europe Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#include <stdio.h>
-+#include <string.h>
-+#include <stdlib.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <assert.h>
-+#include <stdint.h>
-+#include <sys/mman.h>
-+#include <sys/ioctl.h>
-+
-+#include <linux/ioctl.h>
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/char_dev"
-+
-+#include "rpi_mailbox.h"
-+
-+#define PAGE_SIZE (4*1024)
-+
-+// Shared memory will not be cached in ARM cache
-+void *mapmem_shared(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_SHARED/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
-+void *mapmem_private(unsigned base, unsigned size)
-+{
-+   int mem_fd;
-+   unsigned offset = base % PAGE_SIZE;
-+   base = base - offset;
-+   /* open /dev/mem */
-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-+      return NULL;
-+   }
-+   void *mem = mmap(
-+      0,
-+      size,
-+      PROT_READ|PROT_WRITE,
-+      MAP_PRIVATE/*|MAP_FIXED*/,
-+      mem_fd,
-+      base);
-+#ifdef DEBUG
-+   printf("base=0x%x, mem=%p\n", base, mem);
-+#endif
-+   if (mem == MAP_FAILED) {
-+      printf("mmap error %d\n", (int)mem);
-+      return NULL;
-+   }
-+   close(mem_fd);
-+   return (char *)mem + offset;
-+}
-+
-+void unmapmem(void *addr, unsigned size)
-+{
-+   int s = munmap(addr, size);
-+   if (s != 0) {
-+      printf("munmap error %d\n", s);
-+      exit (-1);
-+   }
-+}
-+
-+/*
-+ * use ioctl to send mbox property message
-+ */
-+
-+static int mbox_property(int file_desc, void *buf)
-+{
-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+   if (ret_val < 0) {
-+      printf("ioctl_set_msg failed:%d\n", ret_val);
-+   }
-+
-+#ifdef DEBUG
-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+   for (i=0; i<size/4; i++)
-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+   return ret_val;
-+}
-+
-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000c; // (the tag id)
-+   p[i++] = 12; // (size of the buffer)
-+   p[i++] = 12; // (size of the data)
-+   p[i++] = size; // (num bytes? or pages?)
-+   p[i++] = align; // (alignment)
-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_free(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000f; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_lock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000d; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_unlock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000e; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x30010; // (the tag id)
-+   p[i++] = 28; // (size of the buffer)
-+   p[i++] = 28; // (size of the data)
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned qpu_enable(int file_desc, unsigned enable)
-+{
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x30012; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = enable;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30011; // (the tag id)
-+   p[i++] = 16; // (size of the buffer)
-+   p[i++] = 16; // (size of the data)
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+int mbox_open() {
-+   int file_desc;
-+
-+   // open a char device file used for communicating with kernel mbox driver
-+   file_desc = open(DEVICE_FILE_NAME, 0);
-+   if (file_desc < 0) {
-+      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
-+      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
-+   }
-+   return file_desc;
-+}
-+
-+void mbox_close(int file_desc) {
-+  close(file_desc);
-+}
-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-new file mode 100644
-index 0000000..c264d2e
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.h
-@@ -0,0 +1,20 @@
-+#ifndef RPI_MAILBOX_H
-+#define RPI_MAILBOX_H
-+
-+extern int mbox_open(void);
-+extern void mbox_close(int file_desc);
-+
-+extern unsigned get_version(int file_desc);
-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
-+extern unsigned mem_free(int file_desc, unsigned handle);
-+extern unsigned mem_lock(int file_desc, unsigned handle);
-+extern unsigned mem_unlock(int file_desc, unsigned handle);
-+extern void *mapmem_shared(unsigned base, unsigned size);
-+extern void *mapmem_private(unsigned base, unsigned size);
-+extern void unmapmem(void *addr, unsigned size);
-+
-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-+extern unsigned qpu_enable(int file_desc, unsigned enable);
-+
-+#endif
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-new file mode 100644
-index 0000000..b1f50ee
---- /dev/null
-+++ b/libavcodec/rpi_qpu.c
-@@ -0,0 +1,652 @@
-+#ifdef RPI
-+// Use the vcsm device for shared memory
-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+#define RPI_USE_VCSM
-+#define RPI_TIME_TOTAL_QPU
-+
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <assert.h>
-+
-+#include "config.h"
-+
-+#include <pthread.h>
-+#include <time.h>
-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_shader.h"
-+#include "rpi_hevc_transform.h"
-+
-+#ifdef RPI_USE_VCSM
-+#include "rpi_user_vcsm.h"
-+#endif
-+
-+// On Pi2 there is no way to access the VPU L2 cache
-+// GPU_MEM_FLG should be 4 for uncached memory.
-+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-+#define GPU_MEM_FLG 0xC
-+#define GPU_MEM_MAP 0x0
-+
-+#define vcos_verify(x) ((x)>=0)
-+
-+typedef unsigned char uint8_t;
-+typedef signed char int8_t;
-+typedef unsigned short uint16_t;
-+typedef unsigned int uint32_t;
-+typedef int int32_t;
-+
-+/*static const unsigned code[] =
-+{
-+  #include "rpi_shader.hex"
-+};*/
-+
-+// Size in 32bit words
-+#define QPU_CODE_SIZE 2048
-+#define VPU_CODE_SIZE 2048
-+
-+struct GPU
-+{
-+  unsigned int qpu_code[QPU_CODE_SIZE];
-+  unsigned int vpu_code[VPU_CODE_SIZE];
-+  int open_count; // Number of allocated video buffers
-+  unsigned int vc_handle; // Handle of this memory
-+  int      mb; // Mailbox handle
-+  int      vc; // Address in GPU memory
-+  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-+};
-+
-+// Stop more than one thread trying to allocate memory or use the processing resources at once
-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static volatile struct GPU* gpu = NULL;
-+
-+#ifdef RPI_TIME_TOTAL_QPU
-+static unsigned int Microseconds(void) {
-+    struct timespec ts;
-+    unsigned int x;
-+    static unsigned int base = 0;
-+    clock_gettime(CLOCK_REALTIME, &ts);
-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
-+    if (base==0) base=x;
-+    return x-base;
-+}
-+#endif
-+
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(volatile struct GPU **gpu) {
-+  int mb = mbox_open();
-+  int vc;
-+  int handle;
-+  volatile struct GPU* ptr;
-+	if (mb < 0)
-+		return -1;
-+
-+	if (qpu_enable(mb, 1)) return -2;
-+
-+#ifdef RPI_USE_VCSM
-+  vcsm_init();
-+#endif
-+
-+  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
-+  if (!handle)
-+  {
-+    qpu_enable(mb, 0);
-+    return -3;
-+  }
-+	vc = mem_lock(mb, handle);
-+	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
-+	if (ptr == NULL)
-+	{	mem_free(mb, handle);
-+		mem_unlock(mb, handle);
-+		qpu_enable(mb, 0);
-+		return -4;
-+	}
-+
-+	ptr->mb = mb;
-+	ptr->vc_handle = handle;
-+	ptr->vc = vc;
-+
-+  *gpu = ptr;
-+
-+  // Now copy over the QPU code into GPU memory
-+  {
-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-+    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+  }
-+  // And the VPU code
-+  {
-+    int num_bytes = sizeof(rpi_hevc_transform);
-+    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+  }
-+
-+  return 0;
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static void gpu_lock(void) {
-+  pthread_mutex_lock(&gpu_mutex);
-+  if (gpu==NULL) {
-+    gpu_init(&gpu);
-+  }
-+}
-+
-+static void gpu_unlock(void) {
-+  pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Allocate memory on GPU
-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-+  gpu_lock();
-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+  p->vcsm_handle = 0;
-+  if (!p->vc_handle)
-+  {
-+    qpu_enable(gpu->mb, 0);
-+    return -3;
-+  }
-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+  p->numbytes = numbytes;
-+  if (p->arm == NULL)
-+  {
-+    mem_free(gpu->mb, p->vc_handle);
-+    mem_unlock(gpu->mb, p->vc_handle);
-+    gpu_unlock();
-+    qpu_enable(gpu->mb, 0);
-+    return -4;
-+  }
-+  gpu->open_count++;
-+  gpu_unlock();
-+  return 0;
-+}
-+
-+void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+{
-+  // This only works when using RPI_USE_VCSM
-+  void *tmp = vcsm_lock(p->vcsm_handle);
-+  vcsm_unlock_ptr(tmp);
-+}
-+
-+// This allocates data that will be
-+//    Cached in ARM L2
-+//    Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-+  gpu_lock();
-+#ifdef RPI_USE_VCSM
-+  {
-+      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
-+      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+      p->arm = vcsm_lock(p->vcsm_handle);
-+      p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  }
-+#else
-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+  p->vcsm_handle = 0;
-+  if (!p->handle)
-+  {
-+    qpu_enable(gpu->mb, 0);
-+    return -3;
-+  }
-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  printf("This mapmem_private does not seem to work\n");
-+  exit(-1);
-+  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+  p->numbytes = numbytes;
-+  if (p->arm == NULL)
-+  {
-+    mem_free(gpu->mb, p->handle);
-+    mem_unlock(gpu->mb, p->handle);
-+    gpu_unlock();
-+    qpu_enable(gpu->mb, 0);
-+    return -4;
-+  }
-+#endif
-+  gpu->open_count++;
-+  gpu_unlock();
-+  return 0;
-+}
-+
-+static void gpu_term(void)
-+{
-+	int mb = gpu->mb;
-+	unsigned handle = gpu->vc_handle;
-+  if (gpu==NULL)
-+    return;
-+	unmapmem((void*)gpu, sizeof(struct GPU));
-+	mem_unlock(mb, handle);
-+	mem_free(mb, handle);
-+	qpu_enable(mb, 0);
-+#ifdef RPI_USE_VCSM
-+  vcsm_exit();
-+#endif
-+	mbox_close(mb);
-+  gpu = NULL;
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T *p) {
-+  int mb = gpu->mb;
-+	unsigned handle = p->vc_handle;
-+  gpu_lock();
-+#ifdef RPI_USE_VCSM
-+  if (p->vcsm_handle) {
-+      mem_unlock(mb,p->vc_handle);
-+      vcsm_unlock_ptr(p->arm);
-+      vcsm_free(p->vcsm_handle);
-+  } else {
-+	unmapmem((void*)p->arm, sizeof(struct GPU));
-+      mem_unlock(mb, handle);
-+      mem_free(mb, handle);
-+  }
-+#else
-+	unmapmem((void*)p->arm, sizeof(struct GPU));
-+	mem_unlock(mb, handle);
-+	mem_free(mb, handle);
-+#endif
-+
-+  gpu->open_count--;
-+  if (gpu->open_count==0) {
-+      printf("Closing GPU\n");
-+      gpu_term();
-+      gpu = NULL;
-+  }
-+  gpu_unlock();
-+}
-+
-+unsigned int vpu_get_fn(void) {
-+  // Make sure that the gpu is initialized
-+  if (gpu==NULL) {
-+    printf("Preparing gpu\n");
-+    gpu_lock();
-+    gpu_unlock();
-+  }
-+  return gpu->vc + offsetof(struct GPU,vpu_code);
-+}
-+
-+unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+{
-+  unsigned r;
-+  gpu_lock();
-+  r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-+  gpu_unlock();
-+  return r;
-+}
-+
-+// Run a program on a QPU with the given code and uniform stream (given in GPU addresses)
-+// The first num QPUs will start at code, the next num2 QPUs will start at code2
-+void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12)
-+{
-+  int i;
-+#ifdef RPI_TIME_TOTAL_QPU
-+  static int last_time=0;
-+  static long long on_time=0;
-+  static long long off_time=0;
-+  int start_time;
-+  int end_time;
-+  static int count=0;
-+#endif
-+
-+  gpu_lock();
-+#ifdef RPI_TIME_TOTAL_QPU
-+  start_time = Microseconds();
-+  if (last_time==0)
-+    last_time = start_time;
-+  off_time += start_time-last_time;
-+#endif
-+  for(i=0;i<num;i++) {
-+    gpu->mail[i*2 + 1] = code;
-+  }
-+  for(;i<num+num2;i++) {
-+    gpu->mail[i*2 + 1] = code2;
-+  }
-+  gpu->mail[0 ] = unifs1;
-+  gpu->mail[2 ] = unifs2;
-+  gpu->mail[4 ] = unifs3;
-+  gpu->mail[6 ] = unifs4;
-+  gpu->mail[8 ] = unifs5;
-+  gpu->mail[10] = unifs6;
-+	gpu->mail[12] = unifs7;
-+	gpu->mail[14] = unifs8;
-+	gpu->mail[16] = unifs9;
-+	gpu->mail[18] = unifs10;
-+	gpu->mail[20] = unifs11;
-+	gpu->mail[22] = unifs12;
-+	execute_qpu(
-+		gpu->mb,
-+		12 /* Number of QPUs */,
-+		gpu->vc + offsetof(struct GPU, mail),
-+		1 /* no flush */,  // Don't flush VPU L1 cache
-+		5000 /* timeout ms */);
-+#ifdef RPI_TIME_TOTAL_QPU
-+  end_time = Microseconds();
-+  last_time = end_time;
-+  on_time += end_time - start_time;
-+  count++;
-+  if ((count&0x7f)==0)
-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-+#endif
-+  gpu_unlock();
-+}
-+
-+unsigned int qpu_get_fn(int num) {
-+    // Make sure that the gpu is initialized
-+    unsigned int *fn;
-+    if (gpu==NULL) {
-+      printf("Preparing gpu\n");
-+      gpu_lock();
-+      gpu_unlock();
-+    }
-+    switch(num) {
-+    case QPU_MC_SETUP:
-+      fn = mc_setup;
-+      break;
-+    case QPU_MC_FILTER:
-+      fn = mc_filter;
-+      break;
-+    case QPU_MC_EXIT:
-+      fn = mc_exit;
-+      break;
-+    case QPU_MC_INTERRUPT_EXIT:
-+      fn = mc_interrupt_exit;
-+      break;
-+    case QPU_MC_FILTER_B:
-+      fn = mc_filter_b;
-+      break;
-+    case QPU_MC_FILTER_HONLY:
-+      fn = mc_filter_honly;
-+      break;
-+    case QPU_MC_SETUP_UV:
-+      fn = mc_setup_uv;
-+      break;
-+    case QPU_MC_FILTER_UV:
-+      fn = mc_filter_uv;
-+      break;
-+    case QPU_MC_FILTER_UV_B:
-+      fn = mc_filter_uv_b;
-+      break;
-+    case QPU_MC_END:
-+      fn = mc_end;
-+      break;
-+    default:
-+      printf("Unknown function\n");
-+      exit(-1);
-+    }
-+    return gpu->vc + 4*(int)(fn-rpi_shader);
-+    //return code[num] + gpu->vc;
-+}
-+
-+#if 0
-+
-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-+
-+static uint8_t av_clip_uint8(int32_t a)
-+{
-+    if (a&(~255)) return (-a)>>31;
-+    else          return a;
-+}
-+
-+static int32_t filter8(const uint8_t *data, int pitch)
-+{
-+   int32_t vsum = 0;
-+   int x, y;
-+
-+   for (y = 0; y < 8; y++) {
-+      int32_t hsum = 0;
-+
-+      for (x = 0; x < 8; x++)
-+         hsum += hcoeffs[x]*data[x + y * pitch];
-+
-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-+   }
-+
-+   return av_clip_uint8( (vsum + 64) >> 7);
-+}
-+
-+// Note regression changes coefficients so is not thread safe
-+//#define REGRESSION
-+#ifdef REGRESSION
-+#define CMAX 100
-+#else
-+#define CMAX 2
-+#endif
-+#define YMAX 16
-+
-+int rpi_test_shader(void)
-+{
-+   int i, c;
-+
-+   uint32_t *unifs;
-+
-+   uint8_t *in_buffer;
-+   uint8_t *out_buffer[2];
-+
-+   GPU_MEM_PTR_T unifs_ptr;
-+   GPU_MEM_PTR_T in_buffer_ptr;
-+   GPU_MEM_PTR_T out_buffer_ptr[2];
-+
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
-+
-+   int pitch = 0x500;
-+
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
-+
-+   printf("This needs to change to reflect new assembler\n");
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
-+
-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return -2;
-+   }
-+   unifs = (uint32_t*)unifs_ptr.arm;
-+
-+   if (!vcos_verify(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-+      return -3;
-+   }
-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
-+
-+   if (!vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-+      return -4;
-+   }
-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-+
-+   for (c = 0; c < CMAX; c++) {
-+      int xo[] = {rand()&31, rand()&31};
-+
-+#ifdef REGRESSION
-+      for (i = 0; i < 8; i++) {
-+         hcoeffs[i] = (int8_t)rand();
-+         vcoeffs[i] = (int8_t)rand();
-+         if (hcoeffs[i]==-128)
-+           hcoeffs[i]++;
-+         if (vcoeffs[i]==-128)
-+           vcoeffs[i]++;
-+      }
-+#endif
-+
-+      for (i = 0; i < 64*23; i++) {
-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-+         in_buffer[i] = rand();
-+      }
-+
-+      // Clear output array
-+      {
-+        int b;
-+        for(b=0;b<2;b++) {
-+          for(i=0;i<16*16;i++) {
-+            out_buffer[b][i] = 3;
-+          }
-+        }
-+      }
-+
-+      unifs[0] = mc_filter;
-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-+      unifs[2] = 64; // src pitch
-+      unifs[3] = pitch; // dst pitch
-+      unifs[4] = 0; // Padding
-+      unifs[5] = 0;
-+      unifs[6] = 0;
-+      unifs[7 ] = mc_filter;
-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[13] = out_buffer_ptr[0].vc;
-+      unifs[14] = mc_exit;
-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+      unifs[20] = out_buffer_ptr[1].vc;
-+
-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-+
-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-+
-+      if (1)
-+      {
-+         int x, y, b;
-+         int bad = 0;
-+
-+         for (b=0; b<2; ++b)
-+            for (y=0; y<YMAX; ++y)
-+               for (x=0; x<16; ++x) {
-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
-+
-+                  if (out_buffer[b][x+y*pitch] != ref) {
-+                      bad = 1;
-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
-+                  }
-+#ifndef REGRESSION
-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
-+#endif
-+               }
-+          if (bad)
-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+          else
-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-+      }
-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
-+   }
-+
-+   gpu_free(&out_buffer_ptr[0]);
-+   gpu_free(&out_buffer_ptr[1]);
-+   gpu_free(&in_buffer_ptr);
-+   gpu_free(&unifs_ptr);
-+
-+   return 0;
-+}
-+
-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
-+{
-+  int x,y;
-+  for (y=0; y<16; ++y) {
-+    for (x=0; x<16; ++x) {
-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
-+    }
-+  }
-+}
-+
-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
-+{
-+   uint32_t *unifs;
-+
-+   GPU_MEM_PTR_T unifs_ptr;
-+   //uint8_t *out_buffer;
-+   //GPU_MEM_PTR_T out_buffer_ptr;
-+
-+   // Addresses in GPU memory of filter programs
-+   uint32_t mc_setup = 0;
-+   uint32_t mc_filter = 0;
-+   uint32_t mc_exit = 0;
-+   //int x,y;
-+
-+   if (gpu==NULL) {
-+      gpu_lock();
-+      gpu_unlock();
-+   }
-+
-+   // Use table to compute locations of program start points
-+   mc_setup = code[0] + gpu->vc;
-+   mc_filter = code[1] + gpu->vc;
-+   mc_exit = code[2] + gpu->vc;
-+
-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+      return;
-+   }
-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
-+
-+   /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         out_buffer[x+y*dst_pitch] = 7;
-+      }
-+    }*/
-+
-+   unifs = (uint32_t*)unifs_ptr.arm;
-+
-+    unifs[0] = mc_filter;
-+    unifs[1] = (int)in_buffer_vc;
-+    unifs[2] = src_pitch; // src pitch
-+    unifs[3] = dst_pitch; // dst pitch
-+    unifs[4] = 0; // Padding
-+    unifs[5] = 0;
-+    unifs[6] = 0;
-+    unifs[7 ] = mc_exit;
-+    unifs[8 ] = (int)in_buffer_vc;
-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+    unifs[13] = (int)dst_vc;
-+    //unifs[13] = (int)out_buffer_ptr.vc;
-+
-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
-+
-+    /*for (y=0; y<16; ++y) {
-+      for (x=0; x<16; ++x) {
-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-+      }
-+    }*/
-+
-+    gpu_free(&unifs_ptr);
-+    //gpu_free(&out_buffer_ptr);
-+}
-+
-+
-+#endif
-+
-+#endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-new file mode 100644
-index 0000000..4e3c35c
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,45 @@
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+typedef struct gpu_mem_ptr_s {
-+  unsigned char *arm; // Pointer to memory mapped on ARM side
-+  int vc_handle;   // Videocore handle of relocatable memory
-+  int vcsm_handle; // Handle for use by VCSM
-+  int vc;       // Address for use in GPU code
-+  int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+
-+// QPU specific functions
-+extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-+
-+enum {
-+  QPU_MC_SETUP,
-+  QPU_MC_FILTER,
-+  QPU_MC_EXIT,
-+  QPU_MC_INTERRUPT_EXIT,
-+  QPU_MC_FILTER_B,
-+  QPU_MC_FILTER_HONLY,
-+  QPU_MC_SETUP_UV,
-+  QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B,
-+  QPU_MC_END
-+  };
-+extern unsigned int qpu_get_fn(int num);
-+
-+// VPU specific functions
-+extern unsigned int vpu_get_fn(void);
-+extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+
-+// Simple test of shader code
-+extern int rpi_test_shader(void);
-+
-+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
-+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
-+
-+#endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-new file mode 100644
-index 0000000..41cc2e1
---- /dev/null
-+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,818 @@
-+#include "rpi_shader.h"
-+
-+#ifdef _MSC_VER
-+   #include <stdint.h>
-+   /* cast through uintptr_t to avoid warnings */
-+   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-+#else
-+   #define POINTER_TO_UINT(X) ((unsigned int)(X))
-+#endif
-+
-+#ifdef __cplusplus
-+extern "C" { /* the types are probably wrong... */
-+#endif
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#ifdef _MSC_VER
-+__declspec(align(8))
-+#elif defined(__GNUC__)
-+__attribute__((aligned(8)))
-+#endif
-+unsigned int rpi_shader[] = {
-+// ::mc_setup
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-+/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
-+/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+// ::mc_filter_uv
-+/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :uvloop
-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter
-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :loop
-+/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// :fast_path
-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :fast_loop
-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter_b
-+/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :bloop
-+/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter_honly
-+/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :loop_honly
-+/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_exit
-+/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_exit1
-+/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit
-+/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit4
-+/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit8
-+/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_setup_uv
-+/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
-+/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+// ::mc_filter_uv_b
-+/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :uvloop_b
-+/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_end
-+};
-+#ifdef __HIGHC__
-+#pragma Align_to(8, rpi_shader)
-+#endif
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-new file mode 100644
-index 0000000..db971f4
---- /dev/null
-+++ b/libavcodec/rpi_shader.h
-@@ -0,0 +1,20 @@
-+#ifndef rpi_shader_H
-+#define rpi_shader_H
-+
-+extern unsigned int rpi_shader[];
-+
-+#define mc_setup (rpi_shader + 0)
-+#define mc_filter_uv (rpi_shader + 146)
-+#define mc_filter (rpi_shader + 360)
-+#define mc_filter_b (rpi_shader + 670)
-+#define mc_filter_honly (rpi_shader + 894)
-+#define mc_exit (rpi_shader + 1048)
-+#define mc_exit1 (rpi_shader + 1066)
-+#define mc_interrupt_exit (rpi_shader + 1082)
-+#define mc_interrupt_exit4 (rpi_shader + 1120)
-+#define mc_interrupt_exit8 (rpi_shader + 1142)
-+#define mc_setup_uv (rpi_shader + 1172)
-+#define mc_filter_uv_b (rpi_shader + 1314)
-+#define mc_end (rpi_shader + 1542)
-+
-+#endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-new file mode 100644
-index 0000000..6851e83
---- /dev/null
-+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1413 @@
-+# register allocation
-+#
-+# ra0...ra7                                     eight horizontal filter coefficients
-+#
-+# rb1...rb7                                     seven shifted copies of the current unfiltered row
-+#
-+# ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
-+#
-+#                                               (ra15 isn't clamped to zero - this happens during the
-+#                                                copy to ra14, and during its use in the vertical filter)
-+#
-+# rb8...rb15                                    eight vertical filter coefficients
-+#
-+# ra16                                          clipped(row start address+elem_num)&~3
-+# ra17                                          per-channel shifts
-+# ra19                                          next ra17
-+#
-+# rb16                                          pitch
-+# rb17                                          height + 5
-+# rb18                                          height + 7
-+# rb19                                          next ra16
-+#
-+# ra20                                          1
-+# ra21                                          64
-+# ra22                                          256
-+# ra23                                          8
-+#
-+# rb20                                          0xffffff00
-+# rb21                                          64
-+# rb22                                          255
-+# rb23                                          24
-+#
-+# rb24                                          vdw_setup_1(dst_pitch)
-+# rb25                                          frame width-1
-+# rb26                                          height<<23 + width<<16 + vdw_setup_0
-+# rb27                                          vdw_setup_0 (depends on QPU number)
-+# rb28                                          vpm_setup (depends on QPU number)
-+# rb29                                          vdw_setup_1(dst_pitch-width)
-+# rb30                                          frame height-1
-+# rb31                                          used as temp to count loop iterations
-+#
-+# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
-+# ra24                                          clipped(row start address+8+elem_num)&~3
-+# ra25                                          per-channel shifts 2
-+# ra26                                          next ra24
-+# ra27                                          next ra25
-+# ra28                                          next y
-+# ra29                                          y for next texture access
-+#
-+# ra31                                          next kernel address
-+
-+.set rb_frame_width_minus_1,       rb25
-+.set rb_frame_height_minus_1,      rb30
-+.set rb_pitch,                     rb16
-+.set ra_x_base,                    ra16
-+.set rb_x_base_next,               rb19
-+.set ra_x2_base,                   ra24
-+.set ra_x2_base_next,              ra26
-+.set ra_xshift,                    ra17
-+
-+.set ra_x2shift,                   ra25
-+.set ra_u2v_ref_offset,            ra25
-+
-+.set ra_xshift_next,               ra19
-+
-+.set ra_x2shift_next,              ra27
-+.set ra_u2v_dst_offset,            ra27
-+
-+.set ra_y_next,                    ra28
-+.set ra_y,                         ra29
-+
-+.set rb_const_64,                  rb21
-+
-+# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
-+::mc_setup
-+
-+# Read starting kernel
-+mov ra31, unif
-+
-+# Load first request location
-+add ra_x_base, unif, elem_num # Store x
-+mov ra_y, unif # Store y
-+mov ra_x2_base, unif # Store frame base
-+
-+# Read image dimensions
-+sub rb25,unif,1
-+sub rb30,unif,1
-+
-+# get source pitch
-+mov rb16, unif
-+
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
-+
-+# load constants
-+
-+mov ra20, 1
-+mov ra21, 64
-+mov ra22, 256
-+mov ra23, 8
-+
-+mov rb20, 0xffffff00
-+mov rb21, 64
-+mov rb22, 255
-+mov rb23, 24
-+
-+# touch vertical context to keep simulator happy
-+
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
-+
-+# Compute part of VPM to use for DMA output
-+mov r2, qpu_num
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1
-+
-+# Compute part of VPM to save data into
-+mov r2, qpu_num
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vpm_setup(0, 4, h8p(0, 0))
-+add rb28, r0, r1
-+
-+# Compute base address for first and second access
-+#add r0, unif, elem_num     # x
-+mov r0, ra_x_base           # Load x
-+add r2, r0, 8               # x+8
-+max r0, r0, 0; mov r1, ra_y # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+shl ra_xshift_next, r0, 3
-+max r2, r2, 0
-+add ra_y, r1, 1
-+min r2, r2, rb_frame_width_minus_1
-+shl ra_x2shift_next, r2, 3
-+max r1, r1, 0  # y
-+min r1, r1, rb_frame_height_minus_1
-+add r0, r0, r3; mul24 r1, r1, rb_pitch
-+add r2, r2, r3
-+and r0, r0, ~3
-+and r2, r2, ~3; mov ra_x_base, r0
-+# submit texture requests for first line
-+add t0s, r0, r1 ; mov ra_x2_base, r2
-+add t0s, r2, r1
-+
-+# Dump padding words
-+mov r0, unif
-+mov r0, unif
-+
-+# submit texture requests for second line
-+max r1, ra_y, 0
-+min r1, r1, rb_frame_height_minus_1
-+add ra_y, ra_y, 1
-+bra -, ra31
-+nop ; mul24 r1, r1, rb_pitch
-+add t0s, r1, ra_x_base
-+add t0s, r1, ra_x2_base
-+
-+################################################################################
-+
-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x_base, ra_x16_base point to the current coordinates for this block
-+::mc_filter_uv
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+shl ra_xshift_next, r0, 3
-+sub r2, unif, r3 # compute offset from frame base u to frame base v
-+add r0, r0, r3
-+and rb_x_base_next, r0, ~3
-+mov ra_y_next, r1
-+add ra_x2_base_next, rb_x_base_next, r2
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+asr rb12, r0, rb23
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+
-+mov r5rep, -8
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+mov r3, 0
-+
-+:uvloop
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+sub r0, r2, r3
-+
-+mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8 ; mov r1, ra22
-+
-+# apply horizontal filter
-+brr.anyn -, r:uvloop
-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr r0, r0, 15          ; mov r1, ra21
-+min.setf ra15, r0, rb22
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r0, ra14, rb14
-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+brr.anyn -, r:uvloop
-+asr r1, r1, 15
-+min r1, r1, rb22
-+max vpm, r1, 0
-+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+################################################################################
-+
-+
-+# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x_base, ra_x16_base point to the current coordinates for this block
-+::mc_filter
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+mov ra_x2shift, ra_x2shift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+add r2, r0, 8 # x+8
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+shl ra_xshift_next, r0, 3
-+max r2, r2, 0
-+min r2, r2, rb_frame_width_minus_1
-+shl ra_x2shift_next, r2, 3
-+add r0, r0, r3
-+add r2, r2, r3
-+and rb_x_base_next, r0, ~3
-+and ra_x2_base_next, r2, ~3
-+mov ra_y_next, r1
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+brr.anynn -, r:fast_path
-+asr rb12, r0, rb23  # delay slot 1
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+
-+mov r5rep, -8 # delay slot 2
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+## nop                                                                 ; ldtmu0     # loop counter increment
-+## shr r0, r4, ra17                                                    ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21         ; mul24 r3, r0, ra0
-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+## sub r2, r2, r3                                                    ; ldtmu0
-+##
-+## mov r0, ra22
-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # apply horizontal filter
-+##
-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+## min r2, r2, rb22
-+## max ra13, r2, 0
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21
-+## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
-+## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+## sub r0, r2, r3
-+##
-+## # apply horizontal filter
-+##
-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+## asr r0, r0, 15
-+## min r0, r0, rb22
-+## max ra14, r0, 0
-+##
-+##
-+##
-+##
-+## nop                                                                 ; ldtmu0     # loop counter increment
-+## shr r0, r4, ra17                                                    ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21         ; mul24 r3, r0, ra0
-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+## sub r0, r2, r3
-+##
-+## # apply horizontal filter
-+##
-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+## asr r0, r0, 15
-+## min r0, r0, rb22
-+## max ra15, r0, 0
-+
-+
-+
-+
-+mov r3, 0
-+
-+:loop
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+sub r0, r2, r3
-+
-+mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8 ; mov r1, ra22
-+
-+# apply horizontal filter
-+brr.anyn -, r:loop
-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr r0, r0, 15          ; mov r1, ra21
-+min.setf ra15, r0, rb22
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r0, ra14, rb14
-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+brr.anyn -, r:loop
-+asr r1, r1, 15
-+min r1, r1, rb22
-+max vpm, r1, 0
-+
-+# DMA out
-+
-+bra -, ra31
-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+mov vw_setup, rb29
-+mov vw_addr, unif # start the VDW
-+
-+####################################################
-+
-+:fast_path
-+## nop                                                                 ; ldtmu0     # loop counter increment
-+## shr r0, r4, ra17                                                    ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21         ; mul24 r3, r0, ra0
-+## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+## sub r2, r2, r3                                                    ; ldtmu0
-+##
-+## mov r0, ra22
-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # apply horizontal filter
-+##
-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+## min r2, r2, rb22
-+## max ra13, r2, 0
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21
-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+## sub r0, r2, r3
-+##
-+## # apply horizontal filter
-+##
-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+## asr r0, r0, 15
-+## min r0, r0, rb22
-+## max ra14, r0, 0
-+##
-+##
-+##
-+##
-+## nop                                                                 ; ldtmu0     # loop counter increment
-+## shr r0, r4, ra17                                                    ; ldtmu0
-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+## add ra16, ra16, rb16 ; mov t0s, ra16
-+##
-+## # generate seven shifted versions
-+## # interleave with scroll of vertical context
-+##
-+## mov r2, rb21   ; mul24    r3, r0, ra0
-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+## sub r0, r2, r3
-+##
-+## # apply horizontal filter
-+##
-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+## asr r0, r0, 15
-+## min r0, r0, rb22
-+## max ra15, r0, 0
-+
-+
-+mov r3, 0  # This signifies the amount of unrolling
-+
-+:fast_loop
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+
-+max r2, ra_y, 0
-+min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+sub r0, r2, r3       ; mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8       ; mov r1, ra22
-+
-+# apply horizontal filter
-+
-+brr.anyn -, r:fast_loop
-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr r0, r0, 15          ; mov r1, ra21
-+min.setf ra15, r0, rb22
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r0, ra14, rb14
-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+brr.anyn -, r:fast_loop
-+asr r1, r1, 15
-+min r1, r1, rb22
-+max vpm, r1, 0
-+
-+# DMA out
-+
-+bra -, ra31
-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+mov vw_setup, rb29
-+mov vw_addr, unif # start the VDW
-+
-+################################################################################
-+
-+# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x_base, ra_x16_base point to the current coordinates for this block
-+::mc_filter_b
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+mov ra_x2shift, ra_x2shift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+add r2, r0, 8 # x+8
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+shl ra_xshift_next, r0, 3
-+max r2, r2, 0
-+min r2, r2, rb_frame_width_minus_1
-+shl ra_x2shift_next, r2, 3
-+add r0, r0, r3
-+add r2, r2, r3
-+and rb_x_base_next, r0, ~3
-+and ra_x2_base_next, r2, ~3
-+mov ra_y_next, r1
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+# r0 is currently height<<7
-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+shl r3, r0, 13
-+shl r3, r3, 8 # Mask off top 8 bits
-+shr r3, r3, 8
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+# In a B frame, so also set up VPM read
-+add vr_setup, r3, rb28
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+asr rb12, r0, rb23
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+
-+mov r5rep, -8
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+mov r3, 0
-+
-+:bloop
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+sub r0, r2, r3
-+
-+mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8 ; mov r1, ra22
-+
-+# apply horizontal filter
-+brr.anyn -, r:bloop
-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr r0, r0, 15          ; mov r1, ra21
-+min.setf ra15, r0, rb22
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r0, ra14, rb14
-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+asr r1, r1, 15          ; mov -, vr_wait
-+min r1, r1, rb22
-+add r0, vpm, 1          # Blend in previous VPM contents at this location
-+brr.anyn -, r:bloop
-+max r1, r1, 0
-+add r1, r1, r0
-+shr vpm, r1, 1
-+
-+# DMA out
-+
-+bra -, ra31
-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+mov vw_setup, rb29
-+mov vw_addr, unif # start the VDW
-+
-+################################################################################
-+
-+# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+# This filter only does horizontal filtering.
-+# It is assumed that the region to fetch does not include extra rows above.
-+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x_base, ra_x16_base point to the current coordinates for this block
-+::mc_filter_honly
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+mov ra_x2shift, ra_x2shift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+add r2, r0, 8 # x+8
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+shl ra_xshift_next, r0, 3
-+max r2, r2, 0
-+min r2, r2, rb_frame_width_minus_1
-+shl ra_x2shift_next, r2, 3
-+add r0, r0, r3
-+add r2, r2, r3
-+and rb_x_base_next, r0, ~3
-+and ra_x2_base_next, r2, ~3
-+mov ra_y_next, r1
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
-+shl r0, r0, 7 ; mov rb18,r0
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+mov r0, unif
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+mov r5rep, -8
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+mov r3, 0
-+
-+:loop_honly
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+sub r0, r2, r3       ; mov r3, rb31
-+
-+sub.setf -, r3, rb18 ; mov r1, ra22
-+
-+mov -, vw_wait   ; mul24 r0, r0, r1
-+brr.anyn -, r:loop_honly
-+asr r0, r0, 15          # delay 1
-+min r0, r0, rb22        # delay 2
-+max vpm, r0, 0          # delay 3
-+
-+# DMA out
-+bra -, ra31
-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+mov vw_setup, rb29
-+mov vw_addr, unif # start the VDW
-+
-+
-+################################################################################
-+
-+# mc_exit()
-+
-+::mc_exit
-+mov  -, vw_wait # wait on the VDW
-+
-+mov -,srel(0)
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+nop        ; nop ; thrend
-+nop        ; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+::mc_exit1
-+mov  -, vw_wait # wait on the VDW
-+
-+#mov -,srel(1)
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+# mc_interrupt_exit()
-+::mc_interrupt_exit
-+mov  -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
-+mov -,sacq(0) # 8
-+mov -,sacq(0) # 9
-+mov -,sacq(0) # 10
-+mov -,sacq(0) # 11
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+# mc_interrupt_exit4()
-+::mc_interrupt_exit4
-+mov  -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+# mc_interrupt_exit8()
-+::mc_interrupt_exit8
-+mov  -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+################################################################################
-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+::mc_setup_uv
-+
-+# Read starting kernel
-+mov ra31, unif
-+
-+# Load first request location
-+add ra_x_base, unif, elem_num # Store x
-+mov ra_y, unif # Store y
-+mov ra_x2_base, unif # Store frame u base
-+nop
-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+
-+# Read image dimensions
-+sub rb25,unif,1
-+sub rb30,unif,1
-+
-+# get source pitch
-+mov rb16, unif
-+
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
-+
-+# load constants
-+
-+mov ra20, 1
-+mov ra21, 64
-+mov ra22, 256
-+mov ra23, 8
-+
-+mov rb20, 0xffffff00
-+mov rb21, 64
-+mov rb22, 255
-+mov rb23, 24
-+
-+# touch vertical context to keep simulator happy
-+
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
-+
-+# Compute part of VPM to use for DMA output
-+mov r2, qpu_num
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1
-+
-+# Compute part of VPM to save data into
-+mov r2, qpu_num
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vpm_setup(0, 4, h8p(0, 0))
-+add rb28, r0, r1
-+
-+# Compute base address for first and second access
-+mov r0, ra_x_base           # Load x
-+max r0, r0, 0; mov r1, ra_y # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+add ra_y, r1, 1
-+add r0, r0, r3
-+and r0, r0, ~3
-+max r1, r1, 0 ; mov ra_x_base, r0 # y
-+min r1, r1, rb_frame_height_minus_1
-+# submit texture requests for first line
-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+add t0s, r0, r1 ; mov ra_x2_base, r2
-+add t0s, r2, r1
-+
-+# Dump padding words
-+mov r0, unif
-+mov r0, unif
-+mov r0, unif
-+
-+# submit texture requests for second line
-+max r1, ra_y, 0
-+min r1, r1, rb_frame_height_minus_1
-+add ra_y, ra_y, 1
-+bra -, ra31
-+nop ; mul24 r1, r1, rb_pitch
-+add t0s, r1, ra_x_base
-+add t0s, r1, ra_x2_base
-+
-+
-+
-+################################################################################
-+
-+::mc_filter_uv_b
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+shl ra_xshift_next, r0, 3
-+sub r2, unif, r3 # compute offset from frame base u to frame base v
-+add r0, r0, r3
-+and rb_x_base_next, r0, ~3
-+mov ra_y_next, r1
-+add ra_x2_base_next, rb_x_base_next, r2
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+
-+# r0 is currently height<<7
-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+shl r3, r0, 13
-+shl r3, r3, 8 # Mask off top 8 bits
-+shr r3, r3, 8
-+
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+# In a B frame, so also set up VPM read
-+add vr_setup, r3, rb28
-+
-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+asr rb12, r0, rb23
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+
-+mov r5rep, -8
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+mov r3, 0
-+
-+:uvloop_b
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r2, rb21         ; mul24 r3, r0, ra0
-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+sub r0, r2, r3
-+
-+mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8 ; mov r1, ra22
-+
-+# apply horizontal filter
-+brr.anyn -, r:uvloop_b
-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr r0, r0, 15          ; mov r1, ra21
-+min.setf ra15, r0, rb22
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r0, ra14, rb14
-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+asr r1, r1, 15
-+min r1, r1, rb22
-+add r0, vpm, 1          # Blend in previous VPM contents at this location
-+brr.anyn -, r:uvloop_b
-+max r1, r1, 0
-+add r1, r1, r0
-+shr vpm, r1, 1
-+
-+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+::mc_end
-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-new file mode 100644
-index 0000000..fbebbbe
---- /dev/null
-+++ b/libavcodec/rpi_user_vcsm.h
-@@ -0,0 +1,425 @@
-+/*
-+Copyright (c) 2012, Broadcom Europe Ltd
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+**       1) Initialize the service.
-+**       2) Allocate shared memory blocks.
-+**       3) Start using the allocated blocks.
-+**          - In order to gain ownership on a block, lock the allocated block,
-+**            locking a block returns a valid address that the user application
-+**            can access.
-+**          - When finished with using the block for the current execution cycle
-+**            or function, and so when giving up the ownership, unlock the block.
-+**       4) A block can be locked/unlocked as many times required - within or outside
-+**          of - a specific execution context.
-+**       5) To completely release an allocated block, free it.
-+**       6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+**   Memory blocks can be allocated in different manners depending on the cache
-+**   behavior desired.  A given block can either be:
-+
-+**       - Allocated in a non cached fashion all the way through host and videocore.
-+**       - Allocated in a cached fashion on host OR videocore.
-+**       - Allocated in a cached fashion on host AND videocore.
-+**
-+**   It is an application decision to determine how to allocate a block.  Evidently
-+**   if the application will be doing substantial read/write accesses to a given block,
-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
-+**   better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, locking the
-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**   It is possible to dynamically change the host cache behavior (ie cached or non
-+**   cached) of a given allocation without needing to free and re-allocate the block.
-+**   This feature can be useful for such application which requires access to the block
-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-+**   the application can optimize performances for a given duration of use.
-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-+**   cache.  If one requires to change the videocore cache behavior, then a new block
-+**   must be created to replace the old one.
-+**
-+**   On successful locking, a valid pointer is returned that the application can use
-+**   to access to data inside the block.  There is no guarantee that the pointer will
-+**   stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, unlocking the
-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-+**   explicitely asked not to flush the cache for performances reasons.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
-+
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
-+
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-+                                    // Result of the walk is seen in the videocore
-+                                    // log.
-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-+                                    // driver (ie for all processes).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+                                    // VCSM_STATUS_HOST_WALK_MAP.
-+                                    //
-+   VCSM_STATUS_NONE,                // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
-+
-+
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
-+
-+
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed.  If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process.  In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-+
-+
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns:        0 on error
-+**                 a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.  The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired.  The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use.  Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL.  The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+                       VCSM_CACHE_TYPE_T cache_update,
-+                       VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* __USER_VCSM__H__INCLUDED__ */
--- 
-2.7.4
-
-
-From 6cfa5910be47865aaaf58c185587189c332765a6 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@argondesign.com>
-Date: Sat, 2 May 2015 21:15:37 +0100
-Subject: [PATCH 04/68] First working version with uncached memory
-
----
- libavcodec/hevc.c               |  61 +++++-
- libavcodec/hevc.h               |  12 +-
- libavcodec/hevc_cabac.c         |  39 +++-
- libavcodec/hevc_filter.c        |  16 ++
- libavcodec/hevcpred_template.c  |   6 +
- libavcodec/rpi_hevc_transform.h | 422 +++++++++++++++++++++++++++++++++++++++-
- libavcodec/rpi_hevc_transform.s | 153 +++++++++++++--
- libavcodec/rpi_qpu.c            |  72 +++++++
- libavcodec/rpi_qpu.h            |   1 +
- 9 files changed, 736 insertions(+), 46 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index ab55df1..94ff709 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -45,6 +45,8 @@
- #include "rpi_qpu.h"
- #endif
- 
-+// #define DISABLE_MC
-+
- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
- 
- /**
-@@ -1079,11 +1081,15 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                         for (i = 0; i < (size * size); i++) {
-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-                         }
-+                        printf("Cross component not supported\n"); // TODO
-+                        exit(-1);
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-                     }
-             }
- 
-             if (lc->tu.cross_pf) {
-+                printf("Cross component not supported\n"); // TODO
-+                exit(-1);
-                 hls_cross_component_pred(s, 1);
-             }
-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-@@ -1112,6 +1118,8 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                         for (i = 0; i < (size * size); i++) {
-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-                         }
-+                        printf("Cross component not supported\n"); // TODO
-+                        exit(-1);
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-                     }
-             }
-@@ -1409,6 +1417,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-     int idx              = ff_hevc_pel_weight[block_w];
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     x_off += mv->x >> 2;
-     y_off += mv->y >> 2;
-     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1479,6 +1491,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
-     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
-         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
-         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
-@@ -1564,6 +1580,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-     intptr_t _mx         = mx << (1 - hshift);
-     intptr_t _my         = my << (1 - vshift);
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     x_off += mv->x >> (2 + hshift);
-     y_off += mv->y >> (2 + vshift);
-     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-@@ -1628,6 +1648,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
-     int hshift = s->ps.sps->hshift[1];
-     int vshift = s->ps.sps->vshift[1];
- 
-+#ifdef DISABLE_MC
-+    return;
-+#endif
-+
-     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
-     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
-     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
-@@ -2367,6 +2391,22 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
- }
- 
- #ifdef RPI
-+static void rpi_execute_transform(HEVCContext *s)
-+{
-+    int i=2;
-+    //int j;
-+    //int16_t *coeffs = s->coeffs_buf_arm[i];
-+    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+    //    s->hevcdsp.idct[4-2](coeffs, 16);
-+    //}
-+
-+    //gpu_cache_flush(&s->coeffs_buf[i]);
-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-+
-+    for(i=0;i<4;i++)
-+        s->num_coeffs[i] = 0;
-+}
-+
- static void rpi_execute_pred_cmds(HEVCContext *s)
- {
-   int i;
-@@ -2387,7 +2427,6 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-       }
-   }
-   s->num_pred_cmds = 0;
--  s->num_coeffs = 0;
- }
- #endif
- 
-@@ -2434,7 +2473,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- #ifdef RPI
--        if (x_ctb + ctb_size >= s->ps.sps->width) {
-+        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-+            rpi_execute_transform(s);
-             rpi_execute_pred_cmds(s);
-         }
- #endif
-@@ -3179,7 +3219,9 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->unif_mv_cmds);
-     av_freep(&s->unif_xfm_cmds);
-     av_freep(&s->univ_pred_cmds);
--    av_freep(&s->coeffs_buf);
-+    for(i = 0; i < 4; i++) {
-+        gpu_free(&s->coeffs_buf[i]);
-+    }
- #endif
- 
-     for (i = 0; i < 3; i++) {
-@@ -3246,13 +3288,16 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-     if (!s->univ_pred_cmds)
-         goto fail;
--    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
--    if (!s->coeffs_buf)
--        goto fail;
-+    for(i = 0; i < 4; i++) {
-+        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-+        if (!s->coeffs_buf_arm[i])
-+            goto fail;
-+    }
-     s->enable_rpi = 0;
- 
-     // A little test program
--    {
-+    /*{
-       GPU_MEM_PTR_T p;
-       int err = gpu_malloc_cached(16, &p);
-       short *q = (short *)p.arm;
-@@ -3273,7 +3318,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-       printf(")\n");
-       gpu_free(&p);
-       goto fail; // Early out
--    }
-+    }*/
- 
- #endif
- 
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 7a1c35f..4167985 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -40,6 +40,11 @@
- #include "thread.h"
- #include "videodsp.h"
- 
-+// define RPI to split the CABAC/prediction/transform into separate stages
-+#ifdef RPI
-+#include "rpi_qpu.h"
-+#endif
-+
- #define MAX_DPB_SIZE 16 // A.4.1
- #define MAX_REFS 16
- 
-@@ -856,11 +861,12 @@ typedef struct HEVCContext {
-     HEVCMvCmd *unif_mv_cmds;
-     HEVCXfmCmd *unif_xfm_cmds;
-     HEVCPredCmd *univ_pred_cmds;
--    int16_t *coeffs_buf;
--    int num_mv_cmds;
-+    GPU_MEM_PTR_T coeffs_buf[4];
-+    int16_t *coeffs_buf_arm[4];
-+    int num_coeffs[4];
-     int num_xfm_cmds;
-+    int num_mv_cmds;
-     int num_pred_cmds;
--    int num_coeffs;
- #endif
- 
-     uint8_t *cabac_state;
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 4e97f06..d1cba86 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1031,6 +1031,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     int vshift = s->ps.sps->vshift[c_idx];
-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-     int explicit_rdpcm_flag = 0;
-@@ -1044,6 +1045,18 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     uint8_t dc_scale;
-     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-                                          lc->tu.intra_pred_mode_c;
-+#ifdef RPI
-+    if (s->enable_rpi) {
-+        int n = trafo_size * trafo_size;
-+        if (use_vpu) {
-+            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
-+            s->num_coeffs[log2_trafo_size - 2] += n;
-+        } else {
-+            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
-+            s->num_coeffs[0] += n;
-+        }
-+    }
-+#endif
- 
-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
- 
-@@ -1488,6 +1501,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-             s->hevcdsp.idct_4x4_luma(coeffs);
-         } else {
-+#ifdef RPI
-+            if (!use_vpu) {
-+              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+              if (max_xy == 0)
-+                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-+              else {
-+                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-+                  if (max_xy < 4)
-+                      col_limit = FFMIN(4, col_limit);
-+                  else if (max_xy < 8)
-+                      col_limit = FFMIN(8, col_limit);
-+                  else if (max_xy < 12)
-+                      col_limit = FFMIN(24, col_limit);
-+
-+                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-+              }
-+            }
-+#else
-             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-             if (max_xy == 0)
-                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-@@ -1501,6 +1532,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                     col_limit = FFMIN(24, col_limit);
-                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-             }
-+#endif
-         }
-     }
-     if (lc->tu.cross_pf) {
-@@ -1512,14 +1544,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     }
- #ifdef RPI
-     if (s->enable_rpi) {
--        int16_t *c = s->coeffs_buf + s->num_coeffs;
--        int n = trafo_size * trafo_size;
-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
--        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
--        s->num_coeffs += n;
-+        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-         cmd->type = RPI_PRED_TRANSFORM_ADD;
-         cmd->size = log2_trafo_size;
--        cmd->buf = c;
-+        cmd->buf = coeffs;
-         cmd->dst = dst;
-         cmd->stride = stride;
-         return;
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 1f33b0c..e4c3da7 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -22,6 +22,10 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+//#define DISABLE_SAO
-+//#define DISABLE_DEBLOCK
-+//#define DISABLE_STRENGTHS
-+
- #include "libavutil/common.h"
- #include "libavutil/internal.h"
- 
-@@ -273,6 +277,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
-     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
-     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
- 
-+#ifdef DISABLE_SAO
-+    return;
-+#endif
-+
-     if (restore) {
-         if (!edges[0]) {
-             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-@@ -496,6 +504,10 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-                s->ps.pps->transquant_bypass_enable_flag;
- 
-+#ifdef DISABLE_DEBLOCK
-+    return;
-+#endif
-+
-     if (x0) {
-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-         left_beta_offset = s->deblock[ctb - 1].beta_offset;
-@@ -726,6 +738,10 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-     int boundary_upper, boundary_left;
-     int i, j, bs;
- 
-+#ifdef DISABLE_STRENGTHS
-+    return;
-+#endif
-+
-     boundary_upper = y0 > 0 && !(y0 & 7);
-     if (boundary_upper &&
-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-index 6ae87cc..71c6d52 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
-@@ -20,6 +20,8 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+//#define DISABLE_INTRA
-+
- #include "libavutil/pixdesc.h"
- 
- #include "bit_depth_template.c"
-@@ -114,6 +116,10 @@ do {                                  \
-     int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
-                            (x0 + size_in_luma_h)) >> hshift;
- 
-+#ifdef DISABLE_INTRA
-+    return;
-+#endif
-+
-     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-         int size_in_luma_pu_v = PU(size_in_luma_v);
-         int size_in_luma_pu_h = PU(size_in_luma_h);
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-index 85a9102..c0c279f 100644
---- a/libavcodec/rpi_hevc_transform.h
-+++ b/libavcodec/rpi_hevc_transform.h
-@@ -3,11 +3,11 @@ unsigned char rpi_hevc_transform [] = {
- 3,
- 3,
- 232,
--128,
-+32,
- 0,
- 0,
- 0,
--20,
-+12,
- 248,
- 0,
- 136,
-@@ -56,9 +56,9 @@ unsigned char rpi_hevc_transform [] = {
- 5,
- 232,
- 0,
--0,
- 8,
- 0,
-+0,
- 128,
- 69,
- 113,
-@@ -108,8 +108,8 @@ unsigned char rpi_hevc_transform [] = {
- 128,
- 2,
- 0,
--248,
--62,
-+8,
-+2,
- 0,
- 128,
- 144,
-@@ -123,13 +123,13 @@ unsigned char rpi_hevc_transform [] = {
- 3,
- 32,
- 8,
--16,
-+20,
- 0,
- 76,
- 254,
- 48,
- 192,
--9,
-+4,
- 4,
- 32,
- 8,
-@@ -155,14 +155,46 @@ unsigned char rpi_hevc_transform [] = {
- 192,
- 41,
- 3,
--68,
-+70,
-+192,
-+80,
-+7,
-+164,
-+255,
-+36,
-+204,
-+96,
-+2,
-+0,
-+248,
-+62,
-+0,
-+3,
-+255,
-+55,
-+208,
-+120,
-+3,
-+224,
-+3,
-+190,
-+11,
-+16,
-+139,
-+246,
-+91,
-+0,
-+103,
-+90,
-+0,
-+70,
- 192,
- 80,
- 7,
- 164,
- 255,
- 36,
--220,
-+204,
- 96,
- 2,
- 0,
-@@ -182,7 +214,7 @@ unsigned char rpi_hevc_transform [] = {
- 16,
- 139,
- 246,
--83,
-+91,
- 0,
- 103,
- 90,
-@@ -209,4 +241,374 @@ unsigned char rpi_hevc_transform [] = {
- 96,
- 90,
- 0,
-+169,
-+3,
-+3,
-+232,
-+32,
-+0,
-+0,
-+0,
-+12,
-+248,
-+0,
-+136,
-+0,
-+0,
-+192,
-+248,
-+0,
-+0,
-+64,
-+232,
-+0,
-+2,
-+0,
-+0,
-+12,
-+248,
-+0,
-+168,
-+0,
-+0,
-+192,
-+248,
-+0,
-+0,
-+3,
-+232,
-+128,
-+0,
-+0,
-+0,
-+7,
-+232,
-+0,
-+2,
-+0,
-+0,
-+4,
-+232,
-+64,
-+0,
-+0,
-+0,
-+5,
-+232,
-+0,
-+8,
-+0,
-+0,
-+57,
-+239,
-+224,
-+247,
-+255,
-+255,
-+72,
-+192,
-+95,
-+207,
-+88,
-+122,
-+88,
-+124,
-+137,
-+64,
-+26,
-+64,
-+161,
-+64,
-+152,
-+64,
-+128,
-+144,
-+31,
-+0,
-+72,
-+232,
-+32,
-+0,
-+0,
-+0,
-+65,
-+232,
-+32,
-+0,
-+0,
-+0,
-+128,
-+144,
-+23,
-+0,
-+145,
-+64,
-+168,
-+64,
-+128,
-+144,
-+19,
-+0,
-+72,
-+232,
-+32,
-+0,
-+0,
-+0,
-+65,
-+232,
-+32,
-+0,
-+0,
-+0,
-+128,
-+144,
-+11,
-+0,
-+74,
-+232,
-+0,
-+8,
-+0,
-+0,
-+242,
-+140,
-+229,
-+192,
-+57,
-+239,
-+32,
-+8,
-+0,
-+0,
-+41,
-+3,
-+12,
-+248,
-+0,
-+128,
-+0,
-+0,
-+192,
-+8,
-+4,
-+0,
-+12,
-+248,
-+0,
-+132,
-+64,
-+0,
-+192,
-+8,
-+4,
-+0,
-+0,
-+96,
-+255,
-+159,
-+131,
-+255,
-+0,
-+232,
-+0,
-+4,
-+0,
-+0,
-+255,
-+159,
-+142,
-+255,
-+4,
-+255,
-+48,
-+204,
-+16,
-+3,
-+224,
-+251,
-+62,
-+0,
-+5,
-+255,
-+51,
-+204,
-+128,
-+3,
-+224,
-+251,
-+16,
-+0,
-+77,
-+254,
-+51,
-+204,
-+9,
-+4,
-+224,
-+251,
-+0,
-+0,
-+128,
-+64,
-+6,
-+232,
-+64,
-+0,
-+0,
-+0,
-+140,
-+248,
-+47,
-+0,
-+0,
-+0,
-+224,
-+99,
-+0,
-+0,
-+4,
-+254,
-+0,
-+144,
-+128,
-+2,
-+0,
-+8,
-+2,
-+0,
-+32,
-+247,
-+240,
-+207,
-+16,
-+3,
-+32,
-+247,
-+176,
-+207,
-+17,
-+3,
-+32,
-+247,
-+112,
-+207,
-+18,
-+3,
-+32,
-+247,
-+48,
-+207,
-+19,
-+3,
-+32,
-+247,
-+240,
-+206,
-+20,
-+3,
-+32,
-+247,
-+176,
-+206,
-+21,
-+3,
-+32,
-+247,
-+112,
-+206,
-+22,
-+3,
-+32,
-+247,
-+48,
-+206,
-+23,
-+3,
-+32,
-+247,
-+240,
-+205,
-+24,
-+3,
-+32,
-+247,
-+176,
-+205,
-+25,
-+3,
-+32,
-+247,
-+112,
-+205,
-+26,
-+3,
-+32,
-+247,
-+48,
-+205,
-+27,
-+3,
-+32,
-+247,
-+240,
-+204,
-+28,
-+3,
-+32,
-+247,
-+176,
-+204,
-+29,
-+3,
-+32,
-+247,
-+112,
-+204,
-+30,
-+3,
-+32,
-+247,
-+48,
-+204,
-+31,
-+3,
-+5,
-+255,
-+51,
-+204,
-+128,
-+3,
-+224,
-+251,
-+16,
-+0,
-+77,
-+254,
-+51,
-+204,
-+9,
-+4,
-+224,
-+251,
-+0,
-+0,
-+0,
-+237,
-+0,
-+4,
-+0,
-+0,
-+140,
-+248,
-+47,
-+0,
-+0,
-+0,
-+224,
-+99,
-+0,
-+0,
-+90,
-+0,
- };
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-index 5e2728d..1e389c7 100644
---- a/libavcodec/rpi_hevc_transform.s
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -58,13 +58,6 @@
- #
- #
- 
--test_add:
--  vldh HX(0,0),(r0)
--  vadd HX(0,0),HX(0,0),10
--  vsth HX(0,0),(r0)
--  mov r0,7 # return value
--  b lr
--
- # Columns are transformed first
- #
- # Store top left half of transMatrix2 in
-@@ -79,7 +72,7 @@ test_add:
- #
- 
- 
--# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
- # num: number of 16x16 transforms to be done
-@@ -87,17 +80,17 @@ test_add:
- hevc_trans_16x16:
-   push r6-r15, lr # TODO cut down number of used registers
- 
--  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
--  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+  mov r3, 16*2 # Stride of transMatrix2 in bytes
-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-   # Now use r0 to describe which matrix we are working on.
-   # Allows us to prefetch the next block of coefficients for efficiency.
-   mov r0,0 # This describes the location where we read our coefficients from
--  mov r3,16*2 # Stride of coefficients in bytes
-+  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
-   mov r7,16*16*2 # Total block size
-   mov r8,64*16 # Value used to swap from current to next VRF location
-   vldh HX(0++,0)+r0,(r1 += r3) REP 16
-   mov r4,64 # Constant used for rounding first pass
--  mov r5,1<<19 # Constant used for rounding second pass
-+  mov r5,1<<11 # Constant used for rounding second pass
- 
-   # At start of block r0,r1 point to the current block (that has already been loaded)
- block_loop:
-@@ -113,12 +106,12 @@ block_loop:
-   vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-   #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-   vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
--  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
- 
-   bl col_trans_16
--  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
--  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
--  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
- 
-   # Save results - note there has been a transposition during the processing so we save columns
-   vsth VX(0,32++)+r0, (r1 += r3) REP 16
-@@ -132,16 +125,136 @@ block_loop:
- 
- # r1,r2,r3 r7,r8 should be preserved
- # HX(0++,0)+r0 is the block to be transformed
--# HX(32++,0) is the 16x16 matrix of transform coefficients
-+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
- # Use HY(48,0) for intermediate results
- # r0 can be used, but should be returned to its original value at the end
- col_trans_16:
--  add r4,r0,16 # Final value for this loop
-+  add r6,r0,16 # Final value for this loop
- col_trans_16_loop:
-   # First compute partial products for a single column
--  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-   # Then sum up the results and place back
-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
--  addcmpblt r0,1,r4,col_trans_16_loop
-+  addcmpblt r0,1,r6,col_trans_16_loop
-   sub r0,16  # but r0 back to its original value
-   b lr
-+
-+col_trans_odd_16:
-+  add r6,r0,16 # Final value for this loop
-+col_trans_odd_16_loop:
-+  # First compute partial products for a single column
-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+  # Then sum up the results and place back
-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+  addcmpblt r0,1,r6,col_trans_odd_16_loop
-+  sub r0,16  # but r0 back to its original value
-+  b lr
-+
-+
-+test_add:
-+  vldh HX(0,0),(r0)
-+  vadd HX(0,0),HX(0,0),10
-+  vsth HX(0,0),(r0)
-+  mov r0,7 # return value
-+  b lr
-+
-+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done
-+#
-+hevc_trans_32x32:
-+  push r6-r15, lr # TODO cut down number of used registers
-+
-+  # Fetch transform matrices
-+  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+  add r0, 16*16*2
-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-+  mov r7, 16*16*2 # Total block size
-+  mov r4, 64 # Constant used for rounding first pass
-+  mov r5, 1<<11 # Constant used for rounding second pass
-+  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-+  # set r8 to 32byte aligned stack pointer
-+  add r8,sp,31
-+  lsr r8,5
-+  lsl r8,5
-+  mov r9,r8  # Backup of the temporary storage
-+  mov r10,r1 # Backup of the coefficient buffer
-+block_loop32:
-+
-+  # COLUMN TRANSFORM
-+  # Transform the first 16 columns
-+  mov r1,r10  # Input Coefficient buffer
-+  mov r8,r9   # Output temporary storage
-+  bl trans32
-+  # Transform the second 16 columns
-+  add r8,32
-+  add r1,32
-+  bl trans32
-+
-+  # ROW TRANSFORM
-+  mov r1,r9  # Input temporary storage
-+  mov r8,r10   # Output Coefficient buffer
-+  bl trans32
-+  # Transform the second 16 columns
-+  add r8,32
-+  add r1,32
-+  bl trans32
-+
-+  add r10, 32*32*2 # move onto next block of coefficients
-+  addcmpbgt r2,-1,0,block_loop32
-+
-+  add sp,sp,32*32*2+32 # Restore stack
-+
-+  pop r6-r15, pc
-+
-+trans32:
-+  # We can no longer afford the VRF space to do prefetching when doing 32x32
-+  # Fetch the even rows
-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+  # Fetch the odd rows
-+  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+
-+  # Transform the even rows using even matrix
-+  mov r0, 0 # Even rows
-+  bl col_trans_16
-+
-+  # Now transform the odd rows using odd matrix
-+  mov r0, 64*16 # Odd rows
-+  bl col_trans_odd_16
-+
-+  # Now apply butterfly to compute the first 16 results
-+  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-+  # 16bit results now in HX(48,32)
-+  mov r0,r8
-+  mov r6,32*2
-+  vsth VX(48,32++),(r0+=r6) REP 16
-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
-+
-+  # Now apply butterfly to compute the second 16 results (in reverse order)
-+  vsub HY(63,0),HY(0,0),HY(16,0)
-+  vsub HY(62,0),HY(0,0),HY(17,0)
-+  vsub HY(61,0),HY(0,0),HY(18,0)
-+  vsub HY(60,0),HY(0,0),HY(19,0)
-+  vsub HY(59,0),HY(0,0),HY(20,0)
-+  vsub HY(58,0),HY(0,0),HY(21,0)
-+  vsub HY(57,0),HY(0,0),HY(22,0)
-+  vsub HY(56,0),HY(0,0),HY(23,0)
-+  vsub HY(55,0),HY(0,0),HY(24,0)
-+  vsub HY(54,0),HY(0,0),HY(25,0)
-+  vsub HY(53,0),HY(0,0),HY(26,0)
-+  vsub HY(52,0),HY(0,0),HY(27,0)
-+  vsub HY(51,0),HY(0,0),HY(28,0)
-+  vsub HY(50,0),HY(0,0),HY(29,0)
-+  vsub HY(49,0),HY(0,0),HY(30,0)
-+  vsub HY(48,0),HY(0,0),HY(31,0)
-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-+  add r0,r8,16*32*2 # Move to 16th row
-+  vsth VX(48,32++),(r0+=r6) REP 16
-+  b lr
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index b1f50ee..d720546 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -3,6 +3,7 @@
- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
- #define RPI_USE_VCSM
- #define RPI_TIME_TOTAL_QPU
-+#define RPI_TIME_TOTAL_VPU
- 
- #include <stdio.h>
- #include <stdlib.h>
-@@ -48,10 +49,47 @@ typedef int int32_t;
- #define QPU_CODE_SIZE 2048
- #define VPU_CODE_SIZE 2048
- 
-+const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
-+{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
-+{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
-+{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
-+{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
-+{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
-+{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
-+{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
-+{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
-+{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
-+{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
-+{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
-+{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
-+{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
-+{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
-+{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
-+// Odd rows
-+{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
-+{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
-+{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
-+{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
-+{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
-+{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
-+{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
-+{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
-+{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
-+{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
-+{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
-+{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
-+{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
-+{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
-+{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
-+};
-+
- struct GPU
- {
-   unsigned int qpu_code[QPU_CODE_SIZE];
-   unsigned int vpu_code[VPU_CODE_SIZE];
-+  short transMatrix2even[16*16];
-   int open_count; // Number of allocated video buffers
-   unsigned int vc_handle; // Handle of this memory
-   int      mb; // Mailbox handle
-@@ -123,6 +161,8 @@ static int gpu_init(volatile struct GPU **gpu) {
-     assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-   }
-+  // And the transform coefficients
-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
- 
-   return 0;
- }
-@@ -274,11 +314,43 @@ unsigned int vpu_get_fn(void) {
-   return gpu->vc + offsetof(struct GPU,vpu_code);
- }
- 
-+unsigned int vpu_get_constants(void) {
-+  if (gpu==NULL) {
-+    gpu_lock();
-+    gpu_unlock();
-+  }
-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
-+}
-+
- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
- {
-   unsigned r;
-+#ifdef RPI_TIME_TOTAL_VPU
-+  static int last_time=0;
-+  static long long on_time=0;
-+  static long long off_time=0;
-+  int start_time;
-+  int end_time;
-+  static int count=0;
-+  static long long countr2=0;
-+#endif
-   gpu_lock();
-+#ifdef RPI_TIME_TOTAL_VPU
-+  start_time = Microseconds();
-+  if (last_time==0)
-+    last_time = start_time;
-+  off_time += start_time-last_time;
-+#endif
-   r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-+#ifdef RPI_TIME_TOTAL_VPU
-+  end_time = Microseconds();
-+  last_time = end_time;
-+  on_time += end_time - start_time;
-+  count++;
-+  countr2 += r2;
-+  if ((count&0x7f)==0)
-+    printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
-+#endif
-   gpu_unlock();
-   return r;
- }
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 4e3c35c..814fc3c 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -34,6 +34,7 @@ extern unsigned int qpu_get_fn(int num);
- 
- // VPU specific functions
- extern unsigned int vpu_get_fn(void);
-+extern unsigned int vpu_get_constants(void);
- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
- 
- // Simple test of shader code
--- 
-2.7.4
-
-
-From 4bb0a7ba6723650e74d63cec2123f76da4c3eb0e Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 5 May 2015 09:41:23 +0100
-Subject: [PATCH 05/68] Fixed deblocking
-
----
- libavcodec/hevc.c | 20 +++++++++++++++++---
- 1 file changed, 17 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 94ff709..391c57a 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2400,8 +2400,9 @@ static void rpi_execute_transform(HEVCContext *s)
-     //    s->hevcdsp.idct[4-2](coeffs, 16);
-     //}
- 
--    //gpu_cache_flush(&s->coeffs_buf[i]);
-+    gpu_cache_flush(&s->coeffs_buf[i]);
-     vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-+    gpu_cache_flush(&s->coeffs_buf[i]);
- 
-     for(i=0;i<4;i++)
-         s->num_coeffs[i] = 0;
-@@ -2440,6 +2441,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
- #ifdef RPI
-+    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
- #endif
- 
-@@ -2473,9 +2475,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- #ifdef RPI
--        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-+        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+            int x;
-+            // Transform all blocks
-             rpi_execute_transform(s);
-+            // Perform intra prediction and residual reconstruction
-             rpi_execute_pred_cmds(s);
-+            // Perform deblocking for CTBs in this row
-+            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
-+                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
-+            }
-+            start_ctb_x = 0;
-         }
- #endif
-         if (more_data < 0) {
-@@ -2486,6 +2496,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         ctb_addr_ts++;
-         ff_hevc_save_states(s, ctb_addr_ts);
-+#ifdef RPI
-+        if (s->enable_rpi)
-+            continue;
-+#endif
-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-     }
- 
-@@ -3289,7 +3303,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     if (!s->univ_pred_cmds)
-         goto fail;
-     for(i = 0; i < 4; i++) {
--        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-         s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-         if (!s->coeffs_buf_arm[i])
-             goto fail;
--- 
-2.7.4
-
-
-From 9079ef888e3d81a69f3c802ddc3c5134679e74a6 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 5 May 2015 11:32:30 +0100
-Subject: [PATCH 06/68] Added 32x32 transform
-
----
- libavcodec/hevc.c               |   8 +-
- libavcodec/hevc_cabac.c         |   4 +-
- libavcodec/rpi_hevc_transform.h | 200 +++++++++++++++++-----------------------
- libavcodec/rpi_hevc_transform.s | 102 ++++++++++----------
- libavcodec/rpi_qpu.c            |   4 +-
- 5 files changed, 148 insertions(+), 170 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 391c57a..0dde6f2 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2400,9 +2400,11 @@ static void rpi_execute_transform(HEVCContext *s)
-     //    s->hevcdsp.idct[4-2](coeffs, 16);
-     //}
- 
--    gpu_cache_flush(&s->coeffs_buf[i]);
--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
--    gpu_cache_flush(&s->coeffs_buf[i]);
-+    gpu_cache_flush(&s->coeffs_buf[2]);
-+    gpu_cache_flush(&s->coeffs_buf[3]);
-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
-+    gpu_cache_flush(&s->coeffs_buf[2]);
-+    gpu_cache_flush(&s->coeffs_buf[3]);
- 
-     for(i=0;i<4;i++)
-         s->num_coeffs[i] = 0;
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index d1cba86..88aa959 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1031,7 +1031,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     int vshift = s->ps.sps->vshift[c_idx];
-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
--    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-+#ifdef RPI
-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-+#endif
-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-     int explicit_rdpcm_flag = 0;
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-index c0c279f..6d772d7 100644
---- a/libavcodec/rpi_hevc_transform.h
-+++ b/libavcodec/rpi_hevc_transform.h
-@@ -1,6 +1,10 @@
- unsigned char rpi_hevc_transform [] = {
- 169,
- 3,
-+62,
-+64,
-+79,
-+64,
- 3,
- 232,
- 32,
-@@ -17,6 +21,22 @@ unsigned char rpi_hevc_transform [] = {
- 248,
- 0,
- 0,
-+64,
-+232,
-+0,
-+2,
-+0,
-+0,
-+12,
-+248,
-+0,
-+168,
-+0,
-+0,
-+192,
-+248,
-+0,
-+0,
- 0,
- 96,
- 3,
-@@ -79,7 +99,7 @@ unsigned char rpi_hevc_transform [] = {
- 70,
- 128,
- 144,
--39,
-+40,
- 0,
- 4,
- 255,
-@@ -113,7 +133,7 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 128,
- 144,
--22,
-+23,
- 0,
- 4,
- 255,
-@@ -153,6 +173,8 @@ unsigned char rpi_hevc_transform [] = {
- 140,
- 211,
- 192,
-+34,
-+31,
- 41,
- 3,
- 70,
-@@ -195,7 +217,7 @@ unsigned char rpi_hevc_transform [] = {
- 255,
- 36,
- 204,
--96,
-+224,
- 2,
- 0,
- 248,
-@@ -219,62 +241,10 @@ unsigned char rpi_hevc_transform [] = {
- 103,
- 90,
- 0,
--8,
--240,
--0,
--128,
--128,
--3,
--0,
--247,
--32,
--128,
--10,
--4,
--136,
--240,
--32,
--0,
--128,
--3,
--112,
--96,
--90,
--0,
--169,
--3,
--3,
--232,
--32,
--0,
--0,
--0,
--12,
--248,
--0,
--136,
--0,
--0,
--192,
--248,
--0,
--0,
-+225,
-+64,
-+242,
- 64,
--232,
--0,
--2,
--0,
--0,
--12,
--248,
--0,
--168,
--0,
--0,
--192,
--248,
--0,
--0,
- 3,
- 232,
- 128,
-@@ -287,18 +257,6 @@ unsigned char rpi_hevc_transform [] = {
- 2,
- 0,
- 0,
--4,
--232,
--64,
--0,
--0,
--0,
--5,
--232,
--0,
--8,
--0,
--0,
- 57,
- 239,
- 224,
-@@ -317,18 +275,26 @@ unsigned char rpi_hevc_transform [] = {
- 64,
- 26,
- 64,
-+4,
-+232,
-+64,
-+0,
-+0,
-+0,
-+149,
-+96,
- 161,
- 64,
- 152,
- 64,
- 128,
- 144,
--31,
-+35,
- 0,
- 72,
- 232,
--32,
- 0,
-+4,
- 0,
- 0,
- 65,
-@@ -339,8 +305,16 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 128,
- 144,
--23,
-+27,
-+0,
-+4,
-+232,
-+0,
-+8,
- 0,
-+0,
-+69,
-+96,
- 145,
- 64,
- 168,
-@@ -351,8 +325,8 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 72,
- 232,
--32,
- 0,
-+4,
- 0,
- 0,
- 65,
-@@ -373,7 +347,7 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 242,
- 140,
--229,
-+221,
- 192,
- 57,
- 239,
-@@ -383,6 +357,8 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 41,
- 3,
-+239,
-+3,
- 12,
- 248,
- 0,
-@@ -390,7 +366,7 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 0,
- 192,
--8,
-+248,
- 4,
- 0,
- 12,
-@@ -400,14 +376,14 @@ unsigned char rpi_hevc_transform [] = {
- 64,
- 0,
- 192,
--8,
-+248,
- 4,
- 0,
- 0,
- 96,
- 255,
- 159,
--131,
-+154,
- 255,
- 0,
- 232,
-@@ -417,7 +393,7 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 255,
- 159,
--142,
-+165,
- 255,
- 4,
- 255,
-@@ -429,7 +405,7 @@ unsigned char rpi_hevc_transform [] = {
- 251,
- 62,
- 0,
--5,
-+4,
- 255,
- 51,
- 204,
-@@ -439,15 +415,15 @@ unsigned char rpi_hevc_transform [] = {
- 251,
- 16,
- 0,
--77,
-+76,
- 254,
- 51,
- 204,
--9,
--4,
-+128,
-+3,
- 224,
- 251,
--0,
-+20,
- 0,
- 128,
- 64,
-@@ -467,16 +443,6 @@ unsigned char rpi_hevc_transform [] = {
- 99,
- 0,
- 0,
--4,
--254,
--0,
--144,
--128,
--2,
--0,
--8,
--2,
--0,
- 32,
- 247,
- 240,
-@@ -488,92 +454,92 @@ unsigned char rpi_hevc_transform [] = {
- 176,
- 207,
- 17,
--3,
-+19,
- 32,
- 247,
- 112,
- 207,
- 18,
--3,
-+35,
- 32,
- 247,
- 48,
- 207,
- 19,
--3,
-+51,
- 32,
- 247,
- 240,
- 206,
- 20,
--3,
-+67,
- 32,
- 247,
- 176,
- 206,
- 21,
--3,
-+83,
- 32,
- 247,
- 112,
- 206,
- 22,
--3,
-+99,
- 32,
- 247,
- 48,
- 206,
- 23,
--3,
-+115,
- 32,
- 247,
- 240,
- 205,
- 24,
--3,
-+131,
- 32,
- 247,
- 176,
- 205,
- 25,
--3,
-+147,
- 32,
- 247,
- 112,
- 205,
- 26,
--3,
-+163,
- 32,
- 247,
- 48,
- 205,
- 27,
--3,
-+179,
- 32,
- 247,
- 240,
- 204,
- 28,
--3,
-+195,
- 32,
- 247,
- 176,
- 204,
- 29,
--3,
-+211,
- 32,
- 247,
- 112,
- 204,
- 30,
--3,
-+227,
- 32,
- 247,
- 48,
- 204,
- 31,
--3,
--5,
-+243,
-+4,
- 255,
- 51,
- 204,
-@@ -583,20 +549,20 @@ unsigned char rpi_hevc_transform [] = {
- 251,
- 16,
- 0,
--77,
-+76,
- 254,
- 51,
- 204,
--9,
--4,
-+128,
-+3,
- 224,
- 251,
--0,
-+20,
- 0,
- 0,
- 237,
-+32,
- 0,
--4,
- 0,
- 0,
- 140,
-@@ -609,6 +575,6 @@ unsigned char rpi_hevc_transform [] = {
- 99,
- 0,
- 0,
--90,
--0,
-+111,
-+3,
- };
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-index 1e389c7..afdb32a 100644
---- a/libavcodec/rpi_hevc_transform.s
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -76,12 +76,19 @@
- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
- # num: number of 16x16 transforms to be done
-+# coeffs32
-+# num32: number of 32x32 transforms
- #
- hevc_trans_16x16:
-   push r6-r15, lr # TODO cut down number of used registers
--
-+  mov r14,r3 # coeffs32
-+  mov r15,r4 # num32
-   mov r3, 16*2 # Stride of transMatrix2 in bytes
-   vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+
-+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-   # Now use r0 to describe which matrix we are working on.
-   # Allows us to prefetch the next block of coefficients for efficiency.
-   mov r0,0 # This describes the location where we read our coefficients from
-@@ -121,6 +128,10 @@ block_loop:
-   add r1,r7
- 
-   addcmpbgt r2,-1,0,block_loop
-+
-+  # Now go and do any 32x32 transforms
-+  b hevc_trans_32x32
-+
-   pop r6-r15, pc
- 
- # r1,r2,r3 r7,r8 should be preserved
-@@ -136,26 +147,18 @@ col_trans_16_loop:
-   # Then sum up the results and place back
-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-   addcmpblt r0,1,r6,col_trans_16_loop
--  sub r0,16  # but r0 back to its original value
-+  sub r0,16  # put r0 back to its original value
-   b lr
- 
- col_trans_odd_16:
-   add r6,r0,16 # Final value for this loop
- col_trans_odd_16_loop:
-   # First compute partial products for a single column
--  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
-   # Then sum up the results and place back
-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-   addcmpblt r0,1,r6,col_trans_odd_16_loop
--  sub r0,16  # but r0 back to its original value
--  b lr
--
--
--test_add:
--  vldh HX(0,0),(r0)
--  vadd HX(0,0),HX(0,0),10
--  vsth HX(0,0),(r0)
--  mov r0,7 # return value
-+  sub r0,16  # put r0 back to its original value
-   b lr
- 
- # hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-@@ -164,18 +167,17 @@ test_add:
- # num: number of 16x16 transforms to be done
- #
- hevc_trans_32x32:
--  push r6-r15, lr # TODO cut down number of used registers
-+  mov r1,r14 # coeffs
-+  mov r2,r15 # num
- 
--  # Fetch transform matrices
--  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
--  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
--  add r0, 16*16*2
--  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+  # Fetch odd transform matrix
-+  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+  #add r0, 16*16*2
-+  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
- 
-   mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-   mov r7, 16*16*2 # Total block size
--  mov r4, 64 # Constant used for rounding first pass
--  mov r5, 1<<11 # Constant used for rounding second pass
-   sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-   # set r8 to 32byte aligned stack pointer
-   add r8,sp,31
-@@ -186,21 +188,27 @@ hevc_trans_32x32:
- block_loop32:
- 
-   # COLUMN TRANSFORM
-+  mov r4, 64 # Constant used for rounding first pass
-+  mov r5, 9 # left shift used for rounding first pass
-+
-   # Transform the first 16 columns
-   mov r1,r10  # Input Coefficient buffer
-   mov r8,r9   # Output temporary storage
-   bl trans32
-   # Transform the second 16 columns
--  add r8,32
-+  add r8,32*16*2
-   add r1,32
-   bl trans32
- 
-   # ROW TRANSFORM
-+  mov r4, 1<<11 # Constant used for rounding second pass
-+  mov r5, 4 # left shift used for rounding second pass
-+
-   mov r1,r9  # Input temporary storage
-   mov r8,r10   # Output Coefficient buffer
-   bl trans32
-   # Transform the second 16 columns
--  add r8,32
-+  add r8,32*16*2
-   add r1,32
-   bl trans32
- 
-@@ -212,11 +220,12 @@ block_loop32:
-   pop r6-r15, pc
- 
- trans32:
-+  push lr
-   # We can no longer afford the VRF space to do prefetching when doing 32x32
-   # Fetch the even rows
--  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+  vldh HX(0++,0),(r1 += r3) REP 16
-   # Fetch the odd rows
--  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
- 
-   # Transform the even rows using even matrix
-   mov r0, 0 # Even rows
-@@ -228,33 +237,32 @@ trans32:
- 
-   # Now apply butterfly to compute the first 16 results
-   vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-   # 16bit results now in HX(48,32)
-   mov r0,r8
-   mov r6,32*2
-   vsth VX(48,32++),(r0+=r6) REP 16
--  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
- 
-   # Now apply butterfly to compute the second 16 results (in reverse order)
--  vsub HY(63,0),HY(0,0),HY(16,0)
--  vsub HY(62,0),HY(0,0),HY(17,0)
--  vsub HY(61,0),HY(0,0),HY(18,0)
--  vsub HY(60,0),HY(0,0),HY(19,0)
--  vsub HY(59,0),HY(0,0),HY(20,0)
--  vsub HY(58,0),HY(0,0),HY(21,0)
--  vsub HY(57,0),HY(0,0),HY(22,0)
--  vsub HY(56,0),HY(0,0),HY(23,0)
--  vsub HY(55,0),HY(0,0),HY(24,0)
--  vsub HY(54,0),HY(0,0),HY(25,0)
--  vsub HY(53,0),HY(0,0),HY(26,0)
--  vsub HY(52,0),HY(0,0),HY(27,0)
--  vsub HY(51,0),HY(0,0),HY(28,0)
--  vsub HY(50,0),HY(0,0),HY(29,0)
--  vsub HY(49,0),HY(0,0),HY(30,0)
--  vsub HY(48,0),HY(0,0),HY(31,0)
--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
--  add r0,r8,16*32*2 # Move to 16th row
-+  vsub HY(63,0),HY(0 ,0),HY(16,0)
-+  vsub HY(62,0),HY(1 ,0),HY(17,0)
-+  vsub HY(61,0),HY(2 ,0),HY(18,0)
-+  vsub HY(60,0),HY(3 ,0),HY(19,0)
-+  vsub HY(59,0),HY(4 ,0),HY(20,0)
-+  vsub HY(58,0),HY(5 ,0),HY(21,0)
-+  vsub HY(57,0),HY(6 ,0),HY(22,0)
-+  vsub HY(56,0),HY(7 ,0),HY(23,0)
-+  vsub HY(55,0),HY(8 ,0),HY(24,0)
-+  vsub HY(54,0),HY(9 ,0),HY(25,0)
-+  vsub HY(53,0),HY(10,0),HY(26,0)
-+  vsub HY(52,0),HY(11,0),HY(27,0)
-+  vsub HY(51,0),HY(12,0),HY(28,0)
-+  vsub HY(50,0),HY(13,0),HY(29,0)
-+  vsub HY(49,0),HY(14,0),HY(30,0)
-+  vsub HY(48,0),HY(15,0),HY(31,0)
-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-+  add r0,r8,32
-   vsth VX(48,32++),(r0+=r6) REP 16
--  b lr
-+  pop pc
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index d720546..12ad5fb 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -89,7 +89,7 @@ struct GPU
- {
-   unsigned int qpu_code[QPU_CODE_SIZE];
-   unsigned int vpu_code[VPU_CODE_SIZE];
--  short transMatrix2even[16*16];
-+  short transMatrix2even[16*16*2];
-   int open_count; // Number of allocated video buffers
-   unsigned int vc_handle; // Handle of this memory
-   int      mb; // Mailbox handle
-@@ -162,7 +162,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-   }
-   // And the transform coefficients
--  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
- 
-   return 0;
- }
--- 
-2.7.4
-
-
-From 6c2ed6109c4dd5c8ab16bf16e0ae3be6ae166e50 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 5 May 2015 16:57:03 +0100
-Subject: [PATCH 07/68] Clear coefficients in advance
-
----
- libavcodec/hevc.c               | 129 ++++++++++++++++++++++++++++------------
- libavcodec/hevc.h               |   6 +-
- libavcodec/hevc_cabac.c         |   7 ++-
- libavcodec/rpi_hevc_transform.h |  50 ++++++++++++++++
- libavcodec/rpi_hevc_transform.s |  16 +++++
- 5 files changed, 168 insertions(+), 40 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 0dde6f2..1424007 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -43,6 +43,8 @@
- 
- #ifdef RPI
- #include "rpi_qpu.h"
-+// For some unknown reason, the code seems to crash if I do a late malloc
-+#define EARLY_MALLOC
- #endif
- 
- // #define DISABLE_MC
-@@ -61,6 +63,20 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- /* free everything allocated  by pic_arrays_init() */
- static void pic_arrays_free(HEVCContext *s)
- {
-+#ifdef RPI
-+#ifdef EARLY_MALLOC
-+#else
-+    printf("pic_arrays_free\n");
-+    if (s->coeffs_buf_arm[0]) {
-+      gpu_free(&s->coeffs_buf_default);
-+      s->coeffs_buf_arm[0] = 0;
-+    }
-+    if (s->coeffs_buf_arm[2]) {
-+      gpu_free(&s->coeffs_buf_accelerated);
-+      s->coeffs_buf_arm[2] = 0;
-+    }
-+#endif
-+#endif
-     av_freep(&s->sao);
-     av_freep(&s->deblock);
- 
-@@ -97,6 +113,28 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-     int ctb_count        = sps->ctb_width * sps->ctb_height;
-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
- 
-+#ifdef RPI
-+#ifdef EARLY_MALLOC
-+#else
-+    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-+    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+    printf("pic_arrays_init\n");
-+    printf("Allocated %d\n",coefs_per_row);
-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+    if (!s->coeffs_buf_arm[0])
-+        goto fail;
-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+    if (!s->coeffs_buf_arm[2])
-+        goto fail;
-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+    printf("Done\n");
-+#endif
-+#endif
-+
-     s->bs_width  = (width  >> 2) + 1;
-     s->bs_height = (height >> 2) + 1;
- 
-@@ -2400,11 +2438,10 @@ static void rpi_execute_transform(HEVCContext *s)
-     //    s->hevcdsp.idct[4-2](coeffs, 16);
-     //}
- 
--    gpu_cache_flush(&s->coeffs_buf[2]);
--    gpu_cache_flush(&s->coeffs_buf[3]);
--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
--    gpu_cache_flush(&s->coeffs_buf[2]);
--    gpu_cache_flush(&s->coeffs_buf[3]);
-+
-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+    //gpu_cache_flush(&s->coeffs_buf_accelerated);
- 
-     for(i=0;i<4;i++)
-         s->num_coeffs[i] = 0;
-@@ -2426,7 +2463,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-       } else {
-+          int trafo_size = 1 << cmd->size;
-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-       }
-   }
-   s->num_pred_cmds = 0;
-@@ -3235,10 +3274,18 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->unif_mv_cmds);
-     av_freep(&s->unif_xfm_cmds);
-     av_freep(&s->univ_pred_cmds);
--    for(i = 0; i < 4; i++) {
--        gpu_free(&s->coeffs_buf[i]);
-+
-+#ifdef EARLY_MALLOC
-+    if (s->coeffs_buf_arm[0]) {
-+      gpu_free(&s->coeffs_buf_default);
-+      s->coeffs_buf_arm[0] = 0;
-+    }
-+    if (s->coeffs_buf_arm[2]) {
-+      gpu_free(&s->coeffs_buf_accelerated);
-+      s->coeffs_buf_arm[2] = 0;
-     }
- #endif
-+#endif
- 
-     for (i = 0; i < 3; i++) {
-         av_freep(&s->sao_pixel_buffer_h[i]);
-@@ -3281,6 +3328,16 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     return 0;
- }
- 
-+#ifdef RPI
-+static av_cold void memclear16(int16_t *p, int n)
-+{
-+  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-+  //int i;
-+  //for(i=0;i<n;i++)
-+  //  p[i] = 0;
-+}
-+#endif
-+
- static av_cold int hevc_init_context(AVCodecContext *avctx)
- {
-     HEVCContext *s = avctx->priv_data;
-@@ -3304,37 +3361,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-     if (!s->univ_pred_cmds)
-         goto fail;
--    for(i = 0; i < 4; i++) {
--        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
--        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
--        if (!s->coeffs_buf_arm[i])
--            goto fail;
--    }
--    s->enable_rpi = 0;
- 
--    // A little test program
--    /*{
--      GPU_MEM_PTR_T p;
--      int err = gpu_malloc_cached(16, &p);
--      short *q = (short *)p.arm;
--      int i;
--      int r;
--      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
--      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
--      printf("Preparing data %p\n",q);
--      for(i=0;i<16;i++)
--        q[i] = i;
--      printf("Flush cache\n");
--      gpu_cache_flush(&p);
--      printf("Executing code\n");
--      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
--      printf("Return value %d (",r);
--      for(i=0;i<16;i++)
--        printf("%d ",q[i]);
--      printf(")\n");
--      gpu_free(&p);
--      goto fail; // Early out
--    }*/
-+    s->coeffs_buf_arm[0] = 0;
-+    s->coeffs_buf_arm[2] = 0;
-+
-+#ifdef EARLY_MALLOC
-+    int coeffs_in_ctb = 64*64;
-+    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+    printf("Allocated %d\n",coefs_per_row);
-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+    if (!s->coeffs_buf_arm[0])
-+        goto fail;
-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+    if (!s->coeffs_buf_arm[2])
-+        goto fail;
-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+    printf("Done\n");
-+    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+#endif
-+
-+    s->enable_rpi = 0;
- 
- #endif
- 
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 4167985..9a228f6 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -861,8 +861,12 @@ typedef struct HEVCContext {
-     HEVCMvCmd *unif_mv_cmds;
-     HEVCXfmCmd *unif_xfm_cmds;
-     HEVCPredCmd *univ_pred_cmds;
--    GPU_MEM_PTR_T coeffs_buf[4];
-+    int buf_width;
-+    GPU_MEM_PTR_T coeffs_buf_default;
-+    GPU_MEM_PTR_T coeffs_buf_accelerated;
-     int16_t *coeffs_buf_arm[4];
-+    unsigned int coeffs_buf_vc[4];
-+
-     int num_coeffs[4];
-     int num_xfm_cmds;
-     int num_mv_cmds;
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 88aa959..dbfee85 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1058,9 +1058,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-             s->num_coeffs[0] += n;
-         }
-     }
-+    // We now do the memset after transform_add while we know the data is cached.
-+    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+#else
-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
- #endif
- 
--    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+
- 
-     // Derive QP for dequant
-     if (!lc->cu.cu_transquant_bypass_flag) {
-@@ -1547,7 +1551,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
- #ifdef RPI
-     if (s->enable_rpi) {
-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
--        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-         cmd->type = RPI_PRED_TRANSFORM_ADD;
-         cmd->size = log2_trafo_size;
-         cmd->buf = coeffs;
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-index 6d772d7..4f13622 100644
---- a/libavcodec/rpi_hevc_transform.h
-+++ b/libavcodec/rpi_hevc_transform.h
-@@ -1,4 +1,10 @@
- unsigned char rpi_hevc_transform [] = {
-+21,
-+106,
-+0,
-+144,
-+35,
-+1,
- 169,
- 3,
- 62,
-@@ -577,4 +583,48 @@ unsigned char rpi_hevc_transform [] = {
- 0,
- 111,
- 3,
-+4,
-+254,
-+0,
-+128,
-+0,
-+4,
-+0,
-+248,
-+0,
-+0,
-+2,
-+232,
-+32,
-+0,
-+0,
-+0,
-+140,
-+248,
-+32,
-+0,
-+0,
-+0,
-+224,
-+35,
-+0,
-+0,
-+64,
-+232,
-+0,
-+2,
-+0,
-+0,
-+193,
-+232,
-+0,
-+1,
-+0,
-+0,
-+1,
-+106,
-+116,
-+30,
-+90,
-+0,
- };
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-index afdb32a..fd159bc 100644
---- a/libavcodec/rpi_hevc_transform.s
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -78,8 +78,11 @@
- # num: number of 16x16 transforms to be done
- # coeffs32
- # num32: number of 32x32 transforms
-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
- #
- hevc_trans_16x16:
-+  cmp r5,1
-+  beq memclear16
-   push r6-r15, lr # TODO cut down number of used registers
-   mov r14,r3 # coeffs32
-   mov r15,r4 # num32
-@@ -266,3 +269,16 @@ trans32:
-   add r0,r8,32
-   vsth VX(48,32++),(r0+=r6) REP 16
-   pop pc
-+
-+memclear16:
-+  # r0 is address
-+  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
-+  vmov HX(0++,0),0 REP 16
-+  mov r2,32
-+loop:
-+  vsth HX(0++,0),(r0+=r2) REP 16
-+  add r0,16*16*2
-+  sub r1,16*16
-+  cmp r1,0
-+  bgt loop
-+  b lr
--- 
-2.7.4
-
-
-From 48282c2fb55c0d9a72222f384c03c432f78a3016 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 6 May 2015 09:56:43 +0100
-Subject: [PATCH 08/68] Prepared inter offload
-
----
- libavcodec/hevc.c       | 116 +++++++++++++++++++++++++++++++++++++++++++-----
- libavcodec/hevc.h       |  29 +++++++++++-
- libavcodec/hevc_cabac.c |   5 ++-
- 3 files changed, 137 insertions(+), 13 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 1424007..8215201 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -45,6 +45,8 @@
- #include "rpi_qpu.h"
- // For some unknown reason, the code seems to crash if I do a late malloc
- #define EARLY_MALLOC
-+// Move Inter prediction into separate pass
-+//#define RPI_INTER
- #endif
- 
- // #define DISABLE_MC
-@@ -1440,6 +1442,95 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
   * @param luma_offset additive offset applied to the luma prediction value
   */
  
 +#ifdef RPI_INTER
-+#define RPI_REDIRECT(fn) rpi_ ## fn
++#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
 +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
 +                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
 +                        int block_w, int block_h, int luma_weight, int luma_offset)
 +{
-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_LUMA_UNI;
 +    cmd->dst = dst;
 +    cmd->dststride = dststride;
@@ -7190,31 +2955,29 @@ index 1424007..8215201 100644
 +                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
 +                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
 +{
-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_LUMA_BI;
 +    cmd->dst = dst;
 +    cmd->dststride = dststride;
-+    cmd->src = ref->data[0];
-+    cmd->srcstride = ref->linesize[0];
-+    cmd->mv = *mv;
++    cmd->src = ref0->data[0];
++    cmd->srcstride = ref0->linesize[0];
++    cmd->mv = *mv0;
 +    cmd->x_off = x_off;
 +    cmd->y_off = y_off;
 +    cmd->block_w = block_w;
 +    cmd->block_h = block_h;
-+    cmd->weight = luma_weight;
-+    cmd->offset = luma_offset;
-+    cmd->src1 = ref1->data[];
++    cmd->src1 = ref1->data[0];
 +    cmd->srcstride1 = ref1->linesize[0];
 +    cmd->mv1 = *mv1;
 +    cmd->ref_idx[0] = current_mv->ref_idx[0];
 +    cmd->ref_idx[1] = current_mv->ref_idx[1];
 +}
 +
-+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
 +                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
 +                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
 +{
-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_CHROMA_UNI;
 +    cmd->dst = dst0;
 +    cmd->dststride = dststride;
@@ -7229,27 +2992,27 @@ index 1424007..8215201 100644
 +    cmd->offset = chroma_offset;
 +}
 +
-+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
++static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
 +                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
 +{
-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
 +    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
 +    cmd->dst = dst0;
 +    cmd->dststride = dststride;
 +    cmd->src = ref0->data[cidx+1];
 +    cmd->srcstride = ref0->linesize[cidx+1];
-+    cmd->mv = current_mv->mv[reflist];
++    cmd->mv = current_mv->mv[0];
++    cmd->mv1 = current_mv->mv[1];
 +    cmd->x_off = x_off;
 +    cmd->y_off = y_off;
 +    cmd->block_w = block_w;
 +    cmd->block_h = block_h;
-+    cmd->weight = chroma_weight;
-+    cmd->offset = chroma_offset;
-+    cmd->src = ref1->data[cidx+1];
++    cmd->src1 = ref1->data[cidx+1];
 +    cmd->srcstride1 = ref1->linesize[cidx+1];
 +    cmd->ref_idx[0] = current_mv->ref_idx[0];
 +    cmd->ref_idx[1] = current_mv->ref_idx[1];
 +}
++
 +#else
 +#define RPI_REDIRECT(fn) fn
 +#endif
@@ -7257,7 +3020,18 @@ index 1424007..8215201 100644
  static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
                          AVFrame *ref, const Mv *mv, int x_off, int y_off,
                          int block_w, int block_h, int luma_weight, int luma_offset)
-@@ -1505,7 +1596,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+     int idx              = ff_hevc_pel_weight[block_w];
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     x_off += mv->x >> 2;
+     y_off += mv->y >> 2;
+     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
   * @param mv1 motion vector1 (relative to block position) to get pixel data from
   * @param current_mv current motion vector structure
   */
@@ -7266,18 +3040,153 @@ index 1424007..8215201 100644
                         AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
                         int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
  {
-@@ -1887,16 +1978,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+     intptr_t _mx         = mx << (1 - hshift);
+     intptr_t _my         = my << (1 - vshift);
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     x_off += mv->x >> (2 + hshift);
+     y_off += mv->y >> (2 + vshift);
+     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+     int hshift = s->ps.sps->hshift[1];
+     int vshift = s->ps.sps->vshift[1];
+ 
++#ifdef DISABLE_MC
++    return;
++#endif
++
+     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
+     }
+ }
+ 
+-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                                int nPbW, int nPbH,
+-                                int log2_cb_size, int partIdx, int idx)
++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
++                                const int nPbW, const int nPbH,
++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+ {
+ #define POS(c_idx, x, y)                                                              \
+     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
+                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
+-    HEVCLocalContext *lc = s->HEVClc;
++    HEVCLocalContext * const lc = s->HEVClc;
+     int merge_idx = 0;
+     struct MvField current_mv = {{{ 0 }}};
+ 
+@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+     int y_cb             = y0 >> log2_min_cb_size;
+     int x_pu, y_pu;
+     int i, j;
+-
+-    int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++    const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+ 
+     if (!skip_flag)
+         lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
++#ifdef RPI_LUMA_QPU
++        if (s->enable_rpi) {
++            const Mv * const mv    = &current_mv.mv[0];
++            const unsigned int mx          = mv->x & 3;
++            const unsigned int my          = mv->y & 3;
++            const unsigned int my_mx       = (my<<8) | mx;
++            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
++            const int x1_m3 = x0 + (mv->x >> 2) - 3;
++            const int y1_m3 = y0 + (mv->y >> 2) - 3;
++            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
++            uint32_t * y = s->curr_y_mvs;
++
++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
++              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
++
++              for(int start_x=0;start_x < nPbW;start_x+=16) {
++                  const int bw = nPbW-start_x;
++                  const int bh = nPbH-start_y;
++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
++                  *y++ = my2_mx2_my_mx;
++                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
++                }
++            }
++            s->curr_y_mvs = y;
++        } else
++#endif
++        {
++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
                      &current_mv.mv[0], x0, y0, nPbW, nPbH,
                      s->sh.luma_weight_l0[current_mv.ref_idx[0]],
                      s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
++        }
  
          if (s->ps.sps->chroma_format_idc) {
 -            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
++#ifdef RPI_INTER_QPU
++          if (s->enable_rpi) {
++                int hshift           = s->ps.sps->hshift[1];
++                int vshift           = s->ps.sps->vshift[1];
++                const Mv *mv         = &current_mv.mv[0];
++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
++                intptr_t _mx         = mx << (1 - hshift);
++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
++
++                int x1_c = x0_c + (mv->x >> (2 + hshift));
++                int y1_c = y0_c + (mv->y >> (2 + vshift)); // integer chroma row: vertical MV scales by vshift, matching my above
++
++                uint32_t *u = s->curr_u_mvs;
++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
++                      int bw = nPbW_c-start_x;
++                      int bh = nPbH_c-start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
++                      *u++ = rpi_filter_coefs[_mx][0];
++                      *u++ = rpi_filter_coefs[_my][0];
++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
++                    }
++                }
++                s->curr_u_mvs = u;
++                return;
++            }
++#endif
 +            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
                            0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
@@ -7286,18 +3195,90 @@ index 1424007..8215201 100644
                            0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
          }
-@@ -1906,17 +1997,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
++#ifdef RPI_LUMA_QPU
++        if (s->enable_rpi) {
++            const int reflist = 1;
++            const Mv *mv    = &current_mv.mv[reflist];
++            int mx          = mv->x & 3;
++            int my          = mv->y & 3;
++            int my_mx = (my<<8) + mx;
++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
++            int x1 = x0 + (mv->x >> 2);
++            int y1 = y0 + (mv->y >> 2);
++            uint32_t *y = s->curr_y_mvs;
++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
++              for(int start_x=0;start_x < nPbW;start_x+=16) {
++                  int bw = nPbW-start_x;
++                  int bh = nPbH-start_y;
++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
++                  *y++ = my2_mx2_my_mx;
++                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
++                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
++                }
++            }
++            s->curr_y_mvs = y;
++        } else
++#endif
++
++        {
++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
                      &current_mv.mv[1], x0, y0, nPbW, nPbH,
                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
                      s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
++        }
  
          if (s->ps.sps->chroma_format_idc) {
 -            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
++#ifdef RPI_INTER_QPU
++            if (s->enable_rpi) {
++                const int reflist = 1;
++                const int hshift           = s->ps.sps->hshift[1];
++                const int vshift           = s->ps.sps->vshift[1];
++                const Mv * const mv        = &current_mv.mv[reflist];
++                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
++                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
++                const intptr_t _mx         = mx << (1 - hshift);
++                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
++
++                const int x1_c = x0_c + (mv->x >> (2 + hshift));
++                const int y1_c = y0_c + (mv->y >> (2 + vshift)); // integer chroma row: vertical MV scales by vshift, matching my above
++
++                uint32_t * u = s->curr_u_mvs;
++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
++                      const int bw = nPbW_c-start_x;
++                      const int bh = nPbH_c-start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
++                      *u++ = rpi_filter_coefs[_mx][0];
++                      *u++ = rpi_filter_coefs[_my][0];
++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
++                    }
++                }
++                s->curr_u_mvs = u;
++                return;
++            }
++#endif
 +            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
                            1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
@@ -7307,17 +3288,120 @@ index 1424007..8215201 100644
                            1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
                            s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
          }
-@@ -1926,15 +2017,15 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
  
 -        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
-+        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
++#ifdef RPI_LUMA_QPU
++        if (s->enable_rpi && 0) {
++            const Mv *mv    = &current_mv.mv[0];
++            int mx          = mv->x & 3;
++            int my          = mv->y & 3;
++            int my_mx = (my<<8) + mx;
++            const Mv *mv2    = &current_mv.mv[1];
++            int mx2          = mv2->x & 3;
++            int my2          = mv2->y & 3;
++            int my2_mx2 = (my2<<8) + mx2;
++            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
++            int x1 = x0 + (mv->x >> 2);
++            int y1 = y0 + (mv->y >> 2);
++            int x2 = x0 + (mv2->x >> 2);
++            int y2 = y0 + (mv2->y >> 2);
++            uint32_t *y = s->curr_y_mvs;
++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
++              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
++                  int bw = nPbW-start_x;
++                  int bh = nPbH-start_y;
++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
++                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
++                  *y++ = my2_mx2_my_mx;
++
++                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
++                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
++                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
++
++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
++                }
++            }
++            s->curr_y_mvs = y;
++        } else
++#endif
++        {
++            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
                     ref1->frame, &current_mv.mv[1], &current_mv);
++        }
  
          if (s->ps.sps->chroma_format_idc) {
 -            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
++#ifdef RPI_INTER_QPU
++          if (s->enable_rpi) {
++                int hshift           = s->ps.sps->hshift[1];
++                int vshift           = s->ps.sps->vshift[1];
++                const Mv *mv         = &current_mv.mv[0];
++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
++                intptr_t _mx         = mx << (1 - hshift);
++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
++                int x1_c = x0_c + (mv->x >> (2 + hshift));
++                int y1_c = y0_c + (mv->y >> (2 + vshift)); // integer chroma row for ref0: vertical MV scales by vshift
++
++                const Mv *mv2         = &current_mv.mv[1];
++                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
++                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
++                intptr_t _mx2         = mx2 << (1 - hshift);
++                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
++
++                int x2_c = x0_c + (mv2->x >> (2 + hshift));
++                int y2_c = y0_c + (mv2->y >> (2 + vshift)); // integer chroma row for ref1: vertical MV scales by vshift
++
++
++                uint32_t *u = s->curr_u_mvs;
++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
++                      int bw = nPbW_c-start_x;
++                      int bh = nPbH_c-start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
++                      *u++ = rpi_filter_coefs[_mx][0];
++                      *u++ = rpi_filter_coefs[_my][0];
++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
++                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
++                      *u++ = 0;
++
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
++                      *u++ = rpi_filter_coefs[_mx2][0];
++                      *u++ = rpi_filter_coefs[_my2][0];
++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
++                    }
++                }
++                s->curr_u_mvs = u;
++                return;
++            }
++#endif
 +            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
                           x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
  
@@ -7326,231 +3410,97 @@ index 1424007..8215201 100644
                           x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
          }
      }
-@@ -2465,7 +2556,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-       } else {
-           int trafo_size = 1 << cmd->size;
-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+#ifdef RPI_PRECLEAR
-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+#endif
-       }
-   }
-   s->num_pred_cmds = 0;
-@@ -3381,6 +3474,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-     s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-     printf("Done\n");
-+#ifdef RPI_PRECLEAR
-     //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-     memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-     //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-@@ -3389,6 +3483,8 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     memclear16(s->coeffs_buf_arm[3], coefs_per_row);
- #endif
+@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+ }
  
++#ifdef RPI
++static void rpi_execute_dblk_cmds(HEVCContext *s)
++{
++    int n;
++    int job = s->pass1_job;
++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
++    int (*p)[2] = s->dblk_cmds[job];
++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
++    }
++    s->num_dblk_cmds[job] = 0;
++}
++
++static void rpi_execute_transform(HEVCContext *s)
++{
++    int i=2;
++    int job = s->pass1_job;
++    /*int j;
++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
++        s->hevcdsp.idct[4-2](coeffs, 16);
++    }
++    i=3;
++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
++        s->hevcdsp.idct[5-2](coeffs, 32);
++    }*/
++
++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
++    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
++    //vpu_wait(s->vpu_id);
++
++    for(i=0;i<4;i++)
++        s->num_coeffs[job][i] = 0;
++}
++
++static void rpi_execute_pred_cmds(HEVCContext *s)
++{
++  int i;
++  int job = s->pass1_job;
++  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
++#ifdef RPI_WORKER
++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
++#else
++  HEVCLocalContext *lc = s->HEVClc;
 +#endif
 +
-     s->enable_rpi = 0;
- 
- #endif
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 9a228f6..1ac119a 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -803,14 +803,39 @@ typedef struct HEVCLocalContext {
- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
- #define RPI_MAX_WIDTH 2048
- 
--// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
--#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
- // Each block can have an intra prediction and a transform_add command
- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
- 
-+#define RPI_CMD_LUMA_UNI 0
-+#define RPI_CMD_CHROMA_UNI 1
-+#define RPI_CMD_LUMA_BI 2
-+#define RPI_CMD_U_BI 3
-+#define RPI_CMD_V_BI 4
-+
-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-+// #define RPI_PRECLEAR
-+
- // Command for inter prediction
- typedef struct HEVCMvCmd {
-+    int cmd;
-+    uint8_t *dst;
-+    ptrdiff_t dststride;
-+    uint8_t *src;
-+    ptrdiff_t srcstride;
-+    Mv mv;
-+    int x_off;
-+    int y_off;
-+    int block_w;
-+    int block_h;
-+    int weight;
-+    int offset;
-+    uint8_t *src1;
-+    ptrdiff_t srcstride1;
-+    Mv mv1;
-+    int8_t ref_idx[2];
- } HEVCMvCmd;
- 
- // Command for transform to process a block of coefficients
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index dbfee85..4f072be 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1059,7 +1059,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-         }
-     }
-     // We now do the memset after transform_add while we know the data is cached.
--    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+    #ifdef RPI_PRECLEAR
-+    #else
-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+    #endif
- #else
-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
- #endif
--- 
-2.7.4
-
-
-From 25d3b4e876febe08302a01abd85d5009160ead3e Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 6 May 2015 11:08:50 +0100
-Subject: [PATCH 09/68] Inter prediction in separate pass
-
----
- libavcodec/hevc.c | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
- libavcodec/hevc.h |  2 +-
- 2 files changed, 77 insertions(+), 18 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 8215201..b7bc6ad 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -46,7 +46,7 @@
- // For some unknown reason, the code seems to crash if I do a late malloc
- #define EARLY_MALLOC
- // Move Inter prediction into separate pass
--//#define RPI_INTER
-+#define RPI_INTER
- #endif
- 
- // #define DISABLE_MC
-@@ -1448,7 +1448,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                         int block_w, int block_h, int luma_weight, int luma_offset)
- {
--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-     cmd->cmd = RPI_CMD_LUMA_UNI;
-     cmd->dst = dst;
-     cmd->dststride = dststride;
-@@ -1467,31 +1467,29 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
- {
--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-     cmd->cmd = RPI_CMD_LUMA_BI;
-     cmd->dst = dst;
-     cmd->dststride = dststride;
--    cmd->src = ref->data[0];
--    cmd->srcstride = ref->linesize[0];
--    cmd->mv = *mv;
-+    cmd->src = ref0->data[0];
-+    cmd->srcstride = ref0->linesize[0];
-+    cmd->mv = *mv0;
-     cmd->x_off = x_off;
-     cmd->y_off = y_off;
-     cmd->block_w = block_w;
-     cmd->block_h = block_h;
--    cmd->weight = luma_weight;
--    cmd->offset = luma_offset;
--    cmd->src1 = ref1->data[];
-+    cmd->src1 = ref1->data[0];
-     cmd->srcstride1 = ref1->linesize[0];
-     cmd->mv1 = *mv1;
-     cmd->ref_idx[0] = current_mv->ref_idx[0];
-     cmd->ref_idx[1] = current_mv->ref_idx[1];
- }
- 
--static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
- {
--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-     cmd->cmd = RPI_CMD_CHROMA_UNI;
-     cmd->dst = dst0;
-     cmd->dststride = dststride;
-@@ -1506,27 +1504,27 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-     cmd->offset = chroma_offset;
- }
- 
--static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
- {
--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-     cmd->dst = dst0;
-     cmd->dststride = dststride;
-     cmd->src = ref0->data[cidx+1];
-     cmd->srcstride = ref0->linesize[cidx+1];
--    cmd->mv = current_mv->mv[reflist];
-+    cmd->mv = current_mv->mv[0];
-+    cmd->mv1 = current_mv->mv[1];
-     cmd->x_off = x_off;
-     cmd->y_off = y_off;
-     cmd->block_w = block_w;
-     cmd->block_h = block_h;
--    cmd->weight = chroma_weight;
--    cmd->offset = chroma_offset;
--    cmd->src = ref1->data[cidx+1];
-+    cmd->src1 = ref1->data[cidx+1];
-     cmd->srcstride1 = ref1->linesize[cidx+1];
-     cmd->ref_idx[0] = current_mv->ref_idx[0];
-     cmd->ref_idx[1] = current_mv->ref_idx[1];
- }
-+
- #else
- #define RPI_REDIRECT(fn) fn
- #endif
-@@ -2554,7 +2552,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-       } else {
++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
++      if (cmd->type == RPI_PRED_INTRA) {
++          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
++          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
++          lc->na.cand_left         = (cmd->na >> 3) & 1;
++          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
++          lc->na.cand_up           = (cmd->na >> 1) & 1;
++          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
++          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
++      } else {
 +#ifdef RPI_PRECLEAR
-           int trafo_size = 1 << cmd->size;
++          int trafo_size = 1 << cmd->size;
 +#endif
-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
- #ifdef RPI_PRECLEAR
-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-@@ -2563,6 +2563,61 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-   }
-   s->num_pred_cmds = 0;
- }
++          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
++#ifdef RPI_PRECLEAR
++          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
++#endif
++      }
++  }
++  s->num_pred_cmds[job] = 0;
++}
 +
 +static void rpi_execute_inter_cmds(HEVCContext *s)
 +{
-+    HEVCMvCmd *cmd = s->unif_mv_cmds;
++    int job = s->pass1_job;
++    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
 +    int n,cidx;
 +    AVFrame myref;
 +    AVFrame myref1;
 +    struct MvField mymv;
-+    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
++    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
 +        printf("Overflow inter_cmds\n");
 +        exit(-1);
 +    }
-+    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
++    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
 +        switch(cmd->cmd) {
 +        case RPI_CMD_LUMA_UNI:
 +            myref.data[0] = cmd->src;
@@ -7590,20766 +3540,76 @@ index 8215201..b7bc6ad 100644
 +            break;
 +        }
 +    }
-+    s->num_mv_cmds = 0;
++    s->num_mv_cmds[job] = 0;
 +}
 +
- #endif
- 
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-@@ -2611,6 +2666,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI
-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-             int x;
-+            // Perform inter prediction
-+            rpi_execute_inter_cmds(s);
-             // Transform all blocks
-             rpi_execute_transform(s);
-             // Perform intra prediction and residual reconstruction
-@@ -3422,6 +3479,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
- }
- 
- #ifdef RPI
-+#ifdef RPI_PRECLEAR
- static av_cold void memclear16(int16_t *p, int n)
- {
-   vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-@@ -3430,6 +3488,7 @@ static av_cold void memclear16(int16_t *p, int n)
-   //  p[i] = 0;
- }
- #endif
-+#endif
- 
- static av_cold int hevc_init_context(AVCodecContext *avctx)
- {
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 1ac119a..a0eb71b 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -812,7 +812,7 @@ typedef struct HEVCLocalContext {
- #define RPI_CMD_LUMA_UNI 0
- #define RPI_CMD_CHROMA_UNI 1
- #define RPI_CMD_LUMA_BI 2
--#define RPI_CMD_U_BI 3
-+#define RPI_CMD_CHROMA_BI 3
- #define RPI_CMD_V_BI 4
- 
- // RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
--- 
-2.7.4
-
-
-From 8af0a0a036e4bb3883f144d0567bc527772dd65b Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 6 May 2015 13:03:50 +0100
-Subject: [PATCH 10/68] Added VPU thread
-
----
- libavcodec/hevc.c    |  11 +++--
- libavcodec/hevc.h    |   1 +
- libavcodec/rpi_qpu.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--
- libavcodec/rpi_qpu.h |   2 +
- 4 files changed, 133 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index b7bc6ad..98dbd69 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2529,8 +2529,10 @@ static void rpi_execute_transform(HEVCContext *s)
- 
- 
-     gpu_cache_flush(&s->coeffs_buf_accelerated);
--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-+    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+    //vpu_wait(s->vpu_id);
- 
-     for(i=0;i<4;i++)
-         s->num_coeffs[i] = 0;
-@@ -2666,10 +2668,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI
-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-             int x;
--            // Perform inter prediction
--            rpi_execute_inter_cmds(s);
-             // Transform all blocks
-             rpi_execute_transform(s);
-+            // Perform inter prediction
-+            rpi_execute_inter_cmds(s);
-+            // Wait for transform completion
-+            vpu_wait(s->vpu_id);
-             // Perform intra prediction and residual reconstruction
-             rpi_execute_pred_cmds(s);
-             // Perform deblocking for CTBs in this row
-@@ -3426,6 +3430,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->univ_pred_cmds);
- 
- #ifdef EARLY_MALLOC
-+    printf("hevc_decode_free\n");
-     if (s->coeffs_buf_arm[0]) {
-       gpu_free(&s->coeffs_buf_default);
-       s->coeffs_buf_arm[0] = 0;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index a0eb71b..0d8dfe9 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -896,6 +896,7 @@ typedef struct HEVCContext {
-     int num_xfm_cmds;
-     int num_mv_cmds;
-     int num_pred_cmds;
-+    int vpu_id;
- #endif
- 
-     uint8_t *cabac_state;
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 12ad5fb..378dd74 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -1,9 +1,13 @@
- #ifdef RPI
--// Use the vcsm device for shared memory
-+// define RPI_USE_VCSM to use the vcsm device for shared memory
- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
- #define RPI_USE_VCSM
--#define RPI_TIME_TOTAL_QPU
--#define RPI_TIME_TOTAL_VPU
-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+//#define RPI_TIME_TOTAL_QPU
-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+//#define RPI_TIME_TOTAL_VPU
-+// define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+#define RPI_ASYNC
- 
- #include <stdio.h>
- #include <stdlib.h>
-@@ -113,6 +117,19 @@ static unsigned int Microseconds(void) {
- }
- #endif
- 
-+#ifdef RPI_ASYNC
-+pthread_t vpu_thread;
-+static void *vpu_start(void *arg);
-+
-+#define MAXCMDS 128
-+static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-+static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+
-+static int vpu_cmds[MAXCMDS][8];
-+static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+static volatile int vpu_async_head=0;
-+#endif
-+
- // Connect to QPU, returns 0 on success.
- static int gpu_init(volatile struct GPU **gpu) {
-   int mb = mbox_open();
-@@ -164,12 +181,27 @@ static int gpu_init(volatile struct GPU **gpu) {
-   // And the transform coefficients
-   memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
- 
-+#ifdef RPI_ASYNC
-+  {
-+    int err;
-+    vpu_async_tail = 0;
-+    vpu_async_head = 0;
-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-+    //printf("Created thread\n");
-+    if (err) {
-+        printf("Failed to create vpu thread\n");
-+        return -4;
-+    }
-+  }
-+#endif
-+
-   return 0;
- }
- 
- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
- static void gpu_lock(void) {
-   pthread_mutex_lock(&gpu_mutex);
-+
-   if (gpu==NULL) {
-     gpu_init(&gpu);
-   }
-@@ -264,6 +296,16 @@ static void gpu_term(void)
- 	unsigned handle = gpu->vc_handle;
-   if (gpu==NULL)
-     return;
-+
-+#ifdef RPI_ASYNC
-+  {
-+    void *res;
-+    vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-+    pthread_join(vpu_thread, &res);
-+  }
-+#endif
-+
-+
- 	unmapmem((void*)gpu, sizeof(struct GPU));
- 	mem_unlock(mb, handle);
- 	mem_free(mb, handle);
-@@ -322,6 +364,79 @@ unsigned int vpu_get_constants(void) {
-   return gpu->vc + offsetof(struct GPU,transMatrix2even);
- }
- 
-+#ifdef RPI_ASYNC
-+
-+static void *vpu_start(void *arg) {
-+  while(1) {
-+    pthread_mutex_lock(&post_mutex);
-+    while( vpu_async_tail - vpu_async_head <= 0)
-+    {
-+      //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-+      pthread_cond_wait(&post_cond, &post_mutex);
-+    }
-+    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-+    pthread_mutex_unlock(&post_mutex);
-+
-+    if (p[6] == -1) {
-+      break; // Last job
-+    }
-+    if (p[7]) {
-+        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+        //gpu_cache_flush(buf);
-+    }
-+    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+
-+    pthread_mutex_lock(&post_mutex);
-+    vpu_async_head++;
-+    pthread_cond_broadcast(&post_cond);
-+    pthread_mutex_unlock(&post_mutex);
-+  }
-+
-+  return NULL;
-+}
-+
-+// Post a command to the queue
-+// Returns an id which we can use to wait for completion
-+int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
++static void rpi_do_all_passes(HEVCContext *s)
 +{
-+  pthread_mutex_lock(&post_mutex);
-+  {
-+    int id = vpu_async_tail++;
-+    int *p = vpu_cmds[id%MAXCMDS];
-+    int num = vpu_async_tail - vpu_async_head;
-+    if (num>MAXCMDS) {
-+      printf("Too many commands submitted\n");
-+      exit(-1);
-+    }
-+    p[0] = code;
-+    p[1] = r0;
-+    p[2] = r1;
-+    p[3] = r2;
-+    p[4] = r3;
-+    p[5] = r4;
-+    p[6] = r5;
-+    p[7] = (int) buf;
-+    if (num<=1)
-+      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-+    pthread_mutex_unlock(&post_mutex);
-+    return id;
-+  }
-+}
-+
-+// Wait for completion of the given command
-+void vpu_wait(int id)
-+{
-+  pthread_mutex_lock(&post_mutex);
-+  while( id + 1 - vpu_async_head > 0)
-+  {
-+    pthread_cond_wait(&post_cond, &post_mutex);
-+  }
-+  pthread_mutex_unlock(&post_mutex);
++    // Kick off QPUs and VPUs
++    rpi_launch_vpu_qpu(s);
++    // Perform luma inter prediction
++    rpi_execute_inter_cmds(s);
++    // Wait for transform completion
++    vpu_wait(s->vpu_id);
++    // Perform intra prediction and residual reconstruction
++    rpi_execute_pred_cmds(s);
++    // Perform deblocking for CTBs in this row
++    rpi_execute_dblk_cmds(s);
++    // Prepare next batch
++    rpi_begin(s);
 +}
 +
 +#endif
 +
-+
- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
- {
-   unsigned r;
-@@ -334,7 +449,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-   static int count=0;
-   static long long countr2=0;
- #endif
-+#ifndef RPI_ASYNC
-   gpu_lock();
-+#endif
- #ifdef RPI_TIME_TOTAL_VPU
-   start_time = Microseconds();
-   if (last_time==0)
-@@ -351,7 +468,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-   if ((count&0x7f)==0)
-     printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
- #endif
-+#ifndef RPI_ASYNC
-   gpu_unlock();
-+#endif
-   return r;
- }
- 
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 814fc3c..3526fce 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -36,6 +36,8 @@ extern unsigned int qpu_get_fn(int num);
- extern unsigned int vpu_get_fn(void);
- extern unsigned int vpu_get_constants(void);
- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+extern void vpu_wait( int id);
- 
- // Simple test of shader code
- extern int rpi_test_shader(void);
--- 
-2.7.4
-
-
-From 016d3db644e60fbe272bfcf1d7c3670c82422317 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 6 May 2015 15:03:37 +0100
-Subject: [PATCH 11/68] Added different signal when tail moves
-
----
- libavcodec/rpi_qpu.c | 11 ++++++-----
- 1 file changed, 6 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 378dd74..d1c3e20 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -122,7 +122,8 @@ pthread_t vpu_thread;
- static void *vpu_start(void *arg);
- 
- #define MAXCMDS 128
--static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-+static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-+static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
- 
- static int vpu_cmds[MAXCMDS][8];
-@@ -372,7 +373,7 @@ static void *vpu_start(void *arg) {
-     while( vpu_async_tail - vpu_async_head <= 0)
-     {
-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
--      pthread_cond_wait(&post_cond, &post_mutex);
-+      pthread_cond_wait(&post_cond_tail, &post_mutex);
-     }
-     int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-     pthread_mutex_unlock(&post_mutex);
-@@ -388,7 +389,7 @@ static void *vpu_start(void *arg) {
- 
-     pthread_mutex_lock(&post_mutex);
-     vpu_async_head++;
--    pthread_cond_broadcast(&post_cond);
-+    pthread_cond_broadcast(&post_cond_head);
-     pthread_mutex_unlock(&post_mutex);
-   }
- 
-@@ -417,7 +418,7 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-     p[6] = r5;
-     p[7] = (int) buf;
-     if (num<=1)
--      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-     pthread_mutex_unlock(&post_mutex);
-     return id;
-   }
-@@ -429,7 +430,7 @@ void vpu_wait(int id)
-   pthread_mutex_lock(&post_mutex);
-   while( id + 1 - vpu_async_head > 0)
-   {
--    pthread_cond_wait(&post_cond, &post_mutex);
-+    pthread_cond_wait(&post_cond_head, &post_mutex);
-   }
-   pthread_mutex_unlock(&post_mutex);
- }
--- 
-2.7.4
-
-
-From b04a72641253dc89fd1ec688035c3e2a946aa370 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 7 May 2015 08:57:11 +0100
-Subject: [PATCH 12/68] Add option to test for gpu_idle
-
----
- libavcodec/hevc.c    |  3 ++-
- libavcodec/rpi_qpu.c | 18 ++++++++++++++++++
- 2 files changed, 20 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 98dbd69..2e269b6 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2527,7 +2527,6 @@ static void rpi_execute_transform(HEVCContext *s)
-     //    s->hevcdsp.idct[4-2](coeffs, 16);
-     //}
- 
--
-     gpu_cache_flush(&s->coeffs_buf_accelerated);
-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-@@ -2669,6 +2668,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-             int x;
-             // Transform all blocks
-+            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+
-             rpi_execute_transform(s);
-             // Perform inter prediction
-             rpi_execute_inter_cmds(s);
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index d1c3e20..85f49db 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -199,6 +199,17 @@ static int gpu_init(volatile struct GPU **gpu) {
-   return 0;
- }
- 
-+// Returns 1 if the gpu is currently idle
-+static int gpu_idle(void)
-+{
-+  int ret = pthread_mutex_trylock(&gpu_mutex);
-+  if (ret==0) {
-+    pthread_mutex_unlock(&gpu_mutex);
-+    return 1;
-+  }
-+  return 0;
-+}
-+
- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
- static void gpu_lock(void) {
-   pthread_mutex_lock(&gpu_mutex);
-@@ -400,6 +411,13 @@ static void *vpu_start(void *arg) {
- // Returns an id which we can use to wait for completion
- int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
- {
-+  // If the gpu is idle then just run the command immediately
-+  // This works, but doesn't seem to give any benefit
-+  // if (gpu_idle()) {
-+  //   vpu_execute_code( code,  r0,  r1,  r2,  r3,  r4,  r5);
-+  //   return -1; // TODO perhaps a wraparound bug here?
-+  // }
-+
-   pthread_mutex_lock(&post_mutex);
-   {
-     int id = vpu_async_tail++;
--- 
-2.7.4
-
-
-From e7b457e683d4ca92bf2677b69708fbfc3849847b Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 7 May 2015 11:01:35 +0100
-Subject: [PATCH 13/68] Added deblocking pass
-
----
- libavcodec/hevc.c        | 33 +++++++++++++++++++++++++++------
- libavcodec/hevc.h        |  7 ++++++-
- libavcodec/hevc_filter.c |  6 +++++-
- libavcodec/rpi_qpu.c     |  2 +-
- 4 files changed, 39 insertions(+), 9 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 2e269b6..29f8415 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2518,6 +2518,17 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
- }
- 
- #ifdef RPI
-+static void rpi_execute_dblk_cmds(HEVCContext *s)
-+{
-+    int n;
-+    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+    int (*p)[2] = s->dblk_cmds;
-+    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-+        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-+    }
-+    s->num_dblk_cmds = 0;
-+}
-+
- static void rpi_execute_transform(HEVCContext *s)
- {
-     int i=2;
-@@ -2631,7 +2642,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
- #ifdef RPI
--    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
- #endif
- 
-@@ -2665,7 +2675,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- #ifdef RPI
--        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+        if (s->enable_rpi) {
-+          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-             int x;
-             // Transform all blocks
-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-@@ -2678,10 +2691,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-             // Perform intra prediction and residual reconstruction
-             rpi_execute_pred_cmds(s);
-             // Perform deblocking for CTBs in this row
--            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
--                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
--            }
--            start_ctb_x = 0;
-+            rpi_execute_dblk_cmds(s);
-+          }
-         }
- #endif
-         if (more_data < 0) {
-@@ -2699,6 +2710,16 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-     }
- 
-+#ifdef RPI
-+    if (s->enable_rpi && s->num_dblk_cmds) {
-+        rpi_execute_transform(s);
-+        rpi_execute_inter_cmds(s);
-+        vpu_wait(s->vpu_id);
-+        rpi_execute_pred_cmds(s);
-+        rpi_execute_dblk_cmds(s);
-+    }
-+#endif
-+
-     if (x_ctb + ctb_size >= s->ps.sps->width &&
-         y_ctb + ctb_size >= s->ps.sps->height)
-         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 0d8dfe9..990bd8c 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -808,6 +808,8 @@ typedef struct HEVCLocalContext {
- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
- // Each block can have an intra prediction and a transform_add command
- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+// Worst case is 16x16 CTUs
-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
- 
- #define RPI_CMD_LUMA_UNI 0
- #define RPI_CMD_CHROMA_UNI 1
-@@ -867,6 +869,9 @@ typedef struct HEVCPredCmd {
- #endif
- 
- typedef struct HEVCContext {
-+#ifdef RPI
-+    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-+#endif
-     const AVClass *c;  // needed by private avoptions
-     AVCodecContext *avctx;
- 
-@@ -891,11 +896,11 @@ typedef struct HEVCContext {
-     GPU_MEM_PTR_T coeffs_buf_accelerated;
-     int16_t *coeffs_buf_arm[4];
-     unsigned int coeffs_buf_vc[4];
--
-     int num_coeffs[4];
-     int num_xfm_cmds;
-     int num_mv_cmds;
-     int num_pred_cmds;
-+    int num_dblk_cmds;
-     int vpu_id;
- #endif
- 
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index e4c3da7..ea0af91 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -877,8 +877,12 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             if (s->threads_type & FF_THREAD_FRAME )
-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-         }
--    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-+        int newh = y + ctb_size - 4;
-+        //int currh = s->ref->tf.progress->data[0];
-+        //if (((y + ctb_size)&63)==0)
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+    }
- }
- 
- void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 85f49db..3b6dae7 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -105,7 +105,7 @@ struct GPU
- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
- static volatile struct GPU* gpu = NULL;
- 
--#ifdef RPI_TIME_TOTAL_QPU
-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
- static unsigned int Microseconds(void) {
-     struct timespec ts;
-     unsigned int x;
--- 
-2.7.4
-
-
-From 7a443df9115f21b4428de378bd146dcdba3dd42a Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 7 May 2015 16:47:47 +0100
-Subject: [PATCH 14/68] Added option to disable deblocking for non-ref frames
-
----
- libavcodec/hevc_filter.c | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index ea0af91..2cdd621 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -25,6 +25,8 @@
- //#define DISABLE_SAO
- //#define DISABLE_DEBLOCK
- //#define DISABLE_STRENGTHS
-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-+//#define DISABLE_DEBLOCK_NONREF
- 
- #include "libavutil/common.h"
- #include "libavutil/internal.h"
-@@ -504,6 +506,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-                s->ps.pps->transquant_bypass_enable_flag;
- 
-+#ifdef DISABLE_DEBLOCK_NONREF
-+    if (    s->nal_unit_type == NAL_TRAIL_N ||
-+            s->nal_unit_type == NAL_TSA_N   ||
-+            s->nal_unit_type == NAL_STSA_N  ||
-+            s->nal_unit_type == NAL_RADL_N  ||
-+            s->nal_unit_type == NAL_RASL_N )
-+      return; // Don't deblock non-reference frames
-+#endif
- #ifdef DISABLE_DEBLOCK
-     return;
- #endif
--- 
-2.7.4
-
-
-From 9606e160a582db64ccf981d971cdc258d8cc02f7 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 11 May 2015 10:00:27 +0100
-Subject: [PATCH 15/68] Moved buffers to VPU memory
-
----
- libavcodec/hevc_filter.c | 17 +++++++++++++-
- libavcodec/utils.c       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
- libavutil/buffer.c       |  6 +++++
- libavutil/buffer.h       |  3 +++
- 4 files changed, 84 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 2cdd621..e1b32d4 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -866,6 +866,13 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
- #undef CB
- #undef CR
- 
-+#ifdef RPI_INTER_QPU
-+static void flush_buffer(AVBufferRef *bref) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+    gpu_cache_flush(p);
-+}
-+#endif
-+
- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
- {
-     int x_end = x >= s->ps.sps->width  - ctb_size;
-@@ -888,9 +895,17 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-         }
-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
--        int newh = y + ctb_size - 4;
-+        //int newh = y + ctb_size - 4;
-         //int currh = s->ref->tf.progress->data[0];
-         //if (((y + ctb_size)&63)==0)
-+        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
-+            s->nal_unit_type == NAL_TSA_N   ||
-+            s->nal_unit_type == NAL_STSA_N  ||
-+            s->nal_unit_type == NAL_RADL_N  ||
-+            s->nal_unit_type == NAL_RASL_N )) {
-+            flush_buffer(s->frame->buf[1]);
-+            flush_buffer(s->frame->buf[2]);
-+        }
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-     }
- }
-diff --git a/libavcodec/utils.c b/libavcodec/utils.c
-index f7adb52..708526e 100644
---- a/libavcodec/utils.c
-+++ b/libavcodec/utils.c
-@@ -26,6 +26,12 @@
-  */
- 
- #include "config.h"
-+
 +#ifdef RPI
-+// Move video buffers to GPU memory
-+#define RPI_GPU_BUFFERS
-+#endif
-+
- #include "libavutil/atomic.h"
- #include "libavutil/attributes.h"
- #include "libavutil/avassert.h"
-@@ -64,6 +70,10 @@
- #include "libavutil/ffversion.h"
- const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
- 
-+#ifdef RPI_GPU_BUFFERS
-+#include "rpi_qpu.h"
-+#endif
-+
- #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
- static int default_lockmgr_cb(void **arg, enum AVLockOp op)
- {
-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
-     return ret;
- }
- 
-+#ifdef RPI_GPU_BUFFERS
-+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
-+{
-+    GPU_MEM_PTR_T *p = opaque;
-+    gpu_free(p);
-+    av_free(p);
-+}
-+
-+static AVBufferRef *rpi_buffer_alloc(int size)
-+{
-+    AVBufferRef *ret = NULL;
-+    uint8_t    *data = NULL;
-+    GPU_MEM_PTR_T *p;
-+
-+    static int total=0;
-+    total+=size;
-+
-+    p = av_malloc(sizeof *p);
-+    if (!p)
-+        return NULL;
-+
-+    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
-+        return NULL;
-+
-+    data = p->arm;
-+    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
-+    //memset(data, 64, size);
-+
-+    if (!data)
-+        return NULL;
-+
-+    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
-+    if (!ret) {
-+        gpu_free(p);
-+        av_freep(&p);
-+    }
-+
-+    return ret;
-+}
-+#endif
-+
- static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
- {
-     FramePool *pool = avctx->internal->pool;
-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
-             av_buffer_pool_uninit(&pool->pools[i]);
-             pool->linesize[i] = linesize[i];
-             if (size[i]) {
-+#ifdef RPI_GPU_BUFFERS
-+                if (avctx->codec_id == AV_CODEC_ID_HEVC)
-+                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-+                                                     CONFIG_MEMORY_POISONING ?
-+                                                        NULL :
-+                                                        rpi_buffer_alloc);
-+                else
-+#endif
-                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-                                                      CONFIG_MEMORY_POISONING ?
-                                                         NULL :
-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-index 694e116..203ca7b 100644
---- a/libavutil/buffer.c
-+++ b/libavutil/buffer.c
-@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
- 
-     return ret;
- }
-+
-+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
-+void *av_buffer_pool_opaque(AVBufferRef *ref) {
-+  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
-+  return buf->opaque;
-+}
-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-index 0c0ce12..82e0bc3 100644
---- a/libavutil/buffer.h
-+++ b/libavutil/buffer.h
-@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
-  */
- AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
- 
-+// Return the opaque for the underlying frame
-+void *av_buffer_pool_opaque(AVBufferRef *ref);
-+
- /**
-  * @}
-  */
--- 
-2.7.4
-
-
-From f56515b9a720c829ba3ddf6da4232a91b13e0f03 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 11 May 2015 14:04:37 +0100
-Subject: [PATCH 16/68] Prepared QPU execute code
-
----
- libavcodec/hevc.c        | 227 ++++++++++++++++++++++++++++++++++++++++-------
- libavcodec/hevc.h        |  22 ++++-
- libavcodec/hevc_filter.c |   7 +-
- libavcodec/rpi_qpu.c     |  55 +++++++++++-
- libavcodec/rpi_qpu.h     |   2 +
- 5 files changed, 276 insertions(+), 37 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 29f8415..66ed37a 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -42,17 +42,45 @@
- #include "profiles.h"
- 
- #ifdef RPI
--#include "rpi_qpu.h"
--// For some unknown reason, the code seems to crash if I do a late malloc
--#define EARLY_MALLOC
--// Move Inter prediction into separate pass
--#define RPI_INTER
-+  #include "rpi_qpu.h"
-+  // For some unknown reason, the code seems to crash if I do a late malloc
-+  #define EARLY_MALLOC
-+  // Move Inter prediction into separate pass
-+  #define RPI_INTER
- #endif
- 
- // #define DISABLE_MC
- 
- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
- 
-+
-+#ifdef RPI_INTER_QPU
-+
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+// The QPU code for UV blocks only works up to a block width of 8
-+#define RPI_CHROMA_BLOCK_WIDTH 8
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-+
-+// TODO Chroma only needs 4 taps
-+static uint32_t rpi_filter_coefs[8][2] = {
-+        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-+};
-+
-+static uint32_t get_vc_address(AVBufferRef *bref) {
-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+  return p->vc;
-+}
-+
-+#endif
-+
- /**
-  * NOTE: Each function hls_foo correspond to the function foo in the
-  * specification (HLS stands for High Level Syntax).
-@@ -66,6 +94,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- static void pic_arrays_free(HEVCContext *s)
- {
- #ifdef RPI
-+
- #ifdef EARLY_MALLOC
- #else
-     printf("pic_arrays_free\n");
-@@ -1982,6 +2011,43 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
- 
-         if (s->ps.sps->chroma_format_idc) {
-+#ifdef RPI_INTER_QPU
-+            if (s->enable_rpi) {
-+                int reflist = 0;
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[reflist];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+
-+                uint32_t *u = s->u_mvs[chan & 7];
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_mx][1];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = rpi_filter_coefs[_my][1];
-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->u_mvs[chan & 7] = u;
-+                return;
-+            }
-+#endif
-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
-@@ -2632,6 +2698,54 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
- 
- #endif
- 
-+#ifdef RPI_INTER_QPU
-+static void rpi_inter_clear(HEVCContext *s)
++static void rpi_begin(HEVCContext *s)
 +{
++    int job = s->pass0_job;
 +    int i;
++#ifdef RPI_INTER_QPU
 +    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
 +    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
++
 +    for(i=0;i<8;i++) {
-+        s->u_mvs[i] = s->mvs_base[i];
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = pic_width;
-+        *s->u_mvs[i]++ = pic_height;
-+        *s->u_mvs[i]++ = s->frame->linesize[1];
-+        *s->u_mvs[i]++ = s->frame->linesize[2];
-+        s->u_mvs[i] += 3;  // Padding words
++        s->u_mvs[job][i] = s->mvs_base[job][i];
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = pic_width;
++        *s->u_mvs[job][i]++ = pic_height;
++        *s->u_mvs[job][i]++ = s->frame->linesize[1];
++        *s->u_mvs[job][i]++ = s->frame->linesize[2];
++        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
++        *s->u_mvs[job][i]++ = 0;
++        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
 +    }
-+}
-+
-+static void rpi_execute_inter_qpu(HEVCContext *s)
-+{
-+    int k;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+
-+    if (s->sh.slice_type == I_SLICE)
-+        return;
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+    }
-+
-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+      );
-+}
++    s->curr_u_mvs = s->u_mvs[job][0];
 +#endif
-+
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- {
-     HEVCContext *s  = avctxt->priv_data;
-@@ -2658,6 +2772,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         }
-     }
- 
-+#ifdef RPI_INTER_QPU
-+    rpi_inter_clear(s);
-+#endif
-+
-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
- 
-@@ -2679,19 +2797,30 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
--            int x;
-+#ifdef RPI_INTER_QPU
-+            // Kick off inter prediction on QPUs
-+            rpi_execute_inter_qpu(s);
-+#endif
-             // Transform all blocks
-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
--
-             rpi_execute_transform(s);
-             // Perform inter prediction
-             rpi_execute_inter_cmds(s);
-             // Wait for transform completion
-             vpu_wait(s->vpu_id);
-+
-+            // Copy back reconstructed data
-+            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
-+            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
-+            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-+
-             // Perform intra prediction and residual reconstruction
-             rpi_execute_pred_cmds(s);
-             // Perform deblocking for CTBs in this row
-             rpi_execute_dblk_cmds(s);
-+#ifdef RPI_INTER_QPU
-+            rpi_inter_clear(s);
-+#endif
-           }
-         }
- #endif
-@@ -2712,6 +2841,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- 
- #ifdef RPI
-     if (s->enable_rpi && s->num_dblk_cmds) {
-+#ifdef RPI_INTER_QPU
-+        rpi_execute_inter_qpu(s);
-+#endif
-         rpi_execute_transform(s);
-         rpi_execute_inter_cmds(s);
-         vpu_wait(s->vpu_id);
-@@ -3451,6 +3583,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->unif_xfm_cmds);
-     av_freep(&s->univ_pred_cmds);
- 
-+#ifdef RPI_INTER_QPU
-+    if (s->unif_mvs) {
-+        gpu_free( &s->unif_mvs_ptr );
-+        s->unif_mvs = 0;
-+    }
-+#endif
-+    //gpu_free(&s->dummy);
-+
- #ifdef EARLY_MALLOC
-     printf("hevc_decode_free\n");
-     if (s->coeffs_buf_arm[0]) {
-@@ -3541,34 +3681,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     if (!s->univ_pred_cmds)
-         goto fail;
- 
--    s->coeffs_buf_arm[0] = 0;
--    s->coeffs_buf_arm[2] = 0;
-+#ifdef RPI_INTER_QPU
-+    // We divide the image into blocks 256 wide and 64 high
-+    // We support up to 2048 widths
-+    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
-+    // Also add space for the startup command for each stream.
-+
-+    {
-+        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+        uint32_t *p;
-+        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+
-+        // Set up initial locations for uniform streams
-+        p = s->unif_mvs;
-+        for(i = 0; i < 8; i++) {
-+            s->mvs_base[i] = p;
-+            p += uv_commands_per_qpu;
-+        }
-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-+
-+    }
-+#endif
-+    //gpu_malloc_uncached(2048*64,&s->dummy);
- 
- #ifdef EARLY_MALLOC
--    int coeffs_in_ctb = 64*64;
--    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
--    printf("Allocated %d\n",coefs_per_row);
--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
--    if (!s->coeffs_buf_arm[0])
--        goto fail;
--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
--    if (!s->coeffs_buf_arm[2])
--        goto fail;
--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
--    printf("Done\n");
-+    {
-+        int coeffs_in_ctb = 64*64;
-+        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+        s->coeffs_buf_arm[0] = 0;
-+        s->coeffs_buf_arm[2] = 0;
-+        printf("Allocated %d\n",coefs_per_row);
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+        if (!s->coeffs_buf_arm[0])
-+            goto fail;
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+        if (!s->coeffs_buf_arm[2])
-+            goto fail;
-+        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+        printf("Done\n");
- #ifdef RPI_PRECLEAR
--    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
--    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
--    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
--    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
--    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
--    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
- #endif
--
-+    }
- #endif
- 
-     s->enable_rpi = 0;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 990bd8c..da345f6 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -42,7 +42,11 @@
- 
- // define RPI to split the CABAC/prediction/transform into separate stages
- #ifdef RPI
--#include "rpi_qpu.h"
-+
-+  #include "rpi_qpu.h"
-+  // Use QPU for inter prediction
-+  //#define RPI_INTER_QPU
-+
- #endif
- 
- #define MAX_DPB_SIZE 16 // A.4.1
-@@ -888,7 +892,7 @@ typedef struct HEVCContext {
- 
- #ifdef RPI
-     int enable_rpi;
--    HEVCMvCmd *unif_mv_cmds;
-+    HEVCMvCmd *unif_mv_cmds;  // TODO rename
-     HEVCXfmCmd *unif_xfm_cmds;
-     HEVCPredCmd *univ_pred_cmds;
-     int buf_width;
-@@ -902,6 +906,20 @@ typedef struct HEVCContext {
-     int num_pred_cmds;
-     int num_dblk_cmds;
-     int vpu_id;
-+    //GPU_MEM_PTR_T dummy;
-+#ifdef RPI_INTER_QPU
-+    GPU_MEM_PTR_T unif_mvs_ptr;
-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
-+
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[8];
-+    // Function pointers
-+    uint32_t mc_filter_uv;
-+    uint32_t mc_filter_uv_b;
-+#endif
-+
- #endif
- 
-     uint8_t *cabac_state;
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index e1b32d4..5b3d759 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -903,8 +903,11 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             s->nal_unit_type == NAL_STSA_N  ||
-             s->nal_unit_type == NAL_RADL_N  ||
-             s->nal_unit_type == NAL_RASL_N )) {
--            flush_buffer(s->frame->buf[1]);
--            flush_buffer(s->frame->buf[2]);
-+            //flush_buffer(s->frame->buf[1]);
-+            //flush_buffer(s->frame->buf[2]);
-+            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-         }
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-     }
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 3b6dae7..e4dd58a 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -3,7 +3,7 @@
- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
- #define RPI_USE_VCSM
- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
--//#define RPI_TIME_TOTAL_QPU
-+#define RPI_TIME_TOTAL_QPU
- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
- //#define RPI_TIME_TOTAL_VPU
- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-@@ -30,7 +30,7 @@
- #endif
- 
- // On Pi2 there is no way to access the VPU L2 cache
--// GPU_MEM_FLG should be 4 for uncached memory.
-+// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
- #define GPU_MEM_FLG 0xC
-@@ -549,6 +549,54 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-   gpu_unlock();
- }
- 
-+// Run a program on 8 QPUs with the given code and uniform stream (given in GPU addresses)
-+void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-+{
-+  int i;
-+#ifdef RPI_TIME_TOTAL_QPU
-+  static int last_time=0;
-+  static long long on_time=0;
-+  static long long off_time=0;
-+  int start_time;
-+  int end_time;
-+  static int count=0;
-+#endif
-+
-+  gpu_lock();
-+#ifdef RPI_TIME_TOTAL_QPU
-+  start_time = Microseconds();
-+  if (last_time==0)
-+    last_time = start_time;
-+  off_time += start_time-last_time;
-+#endif
-+  for(i=0;i<8;i++) {
-+    gpu->mail[i*2 + 1] = code;
-+  }
-+  gpu->mail[0 ] = unifs1;
-+  gpu->mail[2 ] = unifs2;
-+  gpu->mail[4 ] = unifs3;
-+  gpu->mail[6 ] = unifs4;
-+  gpu->mail[8 ] = unifs5;
-+  gpu->mail[10] = unifs6;
-+	gpu->mail[12] = unifs7;
-+	gpu->mail[14] = unifs8;
-+	execute_qpu(
-+		gpu->mb,
-+		8 /* Number of QPUs */,
-+		gpu->vc + offsetof(struct GPU, mail),
-+		1 /* no flush */,  // Don't flush VPU L1 cache
-+		5000 /* timeout ms */);
-+#ifdef RPI_TIME_TOTAL_QPU
-+  end_time = Microseconds();
-+  last_time = end_time;
-+  on_time += end_time - start_time;
-+  count++;
-+  if ((count&0x7f)==0)
-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-+#endif
-+  gpu_unlock();
-+}
-+
- unsigned int qpu_get_fn(int num) {
-     // Make sure that the gpu is initialized
-     unsigned int *fn;
-@@ -585,6 +633,9 @@ unsigned int qpu_get_fn(int num) {
-     case QPU_MC_FILTER_UV_B:
-       fn = mc_filter_uv_b;
-       break;
-+    case QPU_MC_INTERRUPT_EXIT8:
-+      fn = mc_interrupt_exit8;
-+      break;
-     case QPU_MC_END:
-       fn = mc_end;
-       break;
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 3526fce..2b22d98 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -16,6 +16,7 @@ extern void gpu_free(GPU_MEM_PTR_T *p);
- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
- 
- // QPU specific functions
-+extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
- 
- enum {
-@@ -28,6 +29,7 @@ enum {
-   QPU_MC_SETUP_UV,
-   QPU_MC_FILTER_UV,
-   QPU_MC_FILTER_UV_B,
-+  QPU_MC_INTERRUPT_EXIT8,
-   QPU_MC_END
-   };
- extern unsigned int qpu_get_fn(int num);
--- 
-2.7.4
-
-
-From bd651e1569ebe0cdc41a6be169e139758cce069d Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 13 May 2015 11:47:23 +0100
-Subject: [PATCH 17/68] Drafted chroma interpolation on QPUs
-
----
- libavcodec/hevc.c          |   5 ++-
- libavcodec/hevc.h          |   2 +-
- libavcodec/hevc_filter.c   |   6 ++-
- libavcodec/rpi_qpu.c       | 101 +++++++++++++++++++++++++++++++++++++++++++--
- libavcodec/rpi_qpu.h       |   1 +
- libavcodec/rpi_shader.c    |  42 +++++++++----------
- libavcodec/rpi_shader.qasm |  42 +++++++++----------
- 7 files changed, 149 insertions(+), 50 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 66ed37a..d5ea45e 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -60,11 +60,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
- 
--#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
- // TODO Chroma only needs 4 taps
- static uint32_t rpi_filter_coefs[8][2] = {
--        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-         { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-         { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-         { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-@@ -2729,6 +2729,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-     for(k=0;k<8;k++) {
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-     }
- 
-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index da345f6..2497c47 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -45,7 +45,7 @@
- 
-   #include "rpi_qpu.h"
-   // Use QPU for inter prediction
--  //#define RPI_INTER_QPU
-+  // #define RPI_INTER_QPU
- 
- #endif
- 
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 5b3d759..9b6e26d 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -903,8 +903,10 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             s->nal_unit_type == NAL_STSA_N  ||
-             s->nal_unit_type == NAL_RADL_N  ||
-             s->nal_unit_type == NAL_RASL_N )) {
--            //flush_buffer(s->frame->buf[1]);
--            //flush_buffer(s->frame->buf[2]);
-+#ifdef RPI_INTER_QPU
-+            flush_buffer(s->frame->buf[1]);
-+            flush_buffer(s->frame->buf[2]);
-+#endif
-             //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-             //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-             //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index e4dd58a..4d9eda8 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -33,7 +33,8 @@
- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
--#define GPU_MEM_FLG 0xC
-+#define GPU_MEM_FLG 0x4
-+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
- #define GPU_MEM_MAP 0x0
- 
- #define vcos_verify(x) ((x)>=0)
-@@ -165,6 +166,8 @@ static int gpu_init(volatile struct GPU **gpu) {
- 	ptr->vc_handle = handle;
- 	ptr->vc = vc;
- 
-+  printf("GPU allocated at 0x%x\n",vc);
-+
-   *gpu = ptr;
- 
-   // Now copy over the QPU code into GPU memory
-@@ -304,10 +307,13 @@ int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
- 
- static void gpu_term(void)
- {
--	int mb = gpu->mb;
--	unsigned handle = gpu->vc_handle;
-+	int mb;
-+	unsigned handle;
-+
-   if (gpu==NULL)
-     return;
-+  mb = gpu->mb;
-+  handle = gpu->vc_handle;
- 
- #ifdef RPI_ASYNC
-   {
-@@ -648,6 +654,95 @@ unsigned int qpu_get_fn(int num) {
- }
- 
- #if 0
-+typedef unsigned int uint32_t;
-+
-+typedef struct mvs_s {
-+    GPU_MEM_PTR_T unif_mvs_ptr;
-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
-+
-+    // _base pointers are to the start of the row
-+    uint32_t *mvs_base[8];
-+    // these pointers are to the next free space
-+    uint32_t *u_mvs[8];
-+
-+} HEVCContext;
-+
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+
-+static void rpi_inter_clear(HEVCContext *s)
-+{
-+    int i;
-+    for(i=0;i<8;i++) {
-+        s->u_mvs[i] = s->mvs_base[i];
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 0;
-+        *s->u_mvs[i]++ = 128;  // w
-+        *s->u_mvs[i]++ = 128;  // h
-+        *s->u_mvs[i]++ = 128;  // stride u
-+        *s->u_mvs[i]++ = 128;  // stride v
-+        s->u_mvs[i] += 3;  // Padding words
-+    }
-+}
-+
-+static void rpi_execute_inter_qpu(HEVCContext *s)
-+{
-+    int k;
-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+
-+    for(k=0;k<8;k++) {
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
-+    }
-+
-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+      );
-+}
-+
-+void rpi_test_qpu(void)
-+{
-+    HEVCContext mvs;
-+    HEVCContext *s = &mvs;
-+    int i;
-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+    uint32_t *p;
-+    printf("Allocate memory\n");
-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-+
-+    // Set up initial locations for uniform streams
-+    p = s->unif_mvs;
-+    for(i = 0; i < 8; i++) {
-+        s->mvs_base[i] = p;
-+        p += uv_commands_per_qpu;
-+    }
-+    // Now run a simple program that should just quit immediately after a single texture fetch
-+    rpi_inter_clear(s);
-+    for(i=0;i<4;i++) {
-+      printf("Launch QPUs\n");
-+      rpi_execute_inter_qpu(s);
-+      printf("Done\n");
-+    }
-+    printf("Free memory\n");
-+    gpu_free(&s->unif_mvs_ptr);
-+    return;
-+}
-+#endif
-+
-+#if 0
- 
- int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
- //int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 2b22d98..f9ad333 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -18,6 +18,7 @@ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
- // QPU specific functions
- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-+extern void rpi_test_qpu(void);
- 
- enum {
-   QPU_MC_SETUP,
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 41cc2e1..d7ed297 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -152,23 +152,23 @@ unsigned int rpi_shader[] = {
- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- /* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- /* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
- /* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
- /* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
- /* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
- /* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
- /* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
- /* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
- /* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-@@ -179,20 +179,20 @@ unsigned int rpi_shader[] = {
- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
- /* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
- /* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- /* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
- /* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
- /* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
- /* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 6851e83..02fdcb2 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -270,23 +270,23 @@ add t0s, ra_x2_base, r2
- 
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
--mov r2, rb21         ; mul24 r3, r0, ra0
--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+mov r2, rb21         ; mul24 r2, r0, ra0
-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--sub r0, r2, r3
-+add r0, r2, r3
- 
- mov r3, rb31
- 
-@@ -302,23 +302,23 @@ sub.setf -, r3, 8 ; mov r1, ra22
- # apply horizontal filter
- brr.anyn -, r:uvloop
- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 15          ; mov r1, ra21
-+asr r0, r0, 14          ; mov r1, ra21
- min.setf ra15, r0, rb22
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r0, ra14, rb14
--sub r1, r1, r0          ; mul24 r0, ra13, rb13
--sub r1, r1, r0          ; mul24 r0, ra12, rb12
--sub r1, r1, r0          ; mul24 r0, ra11, rb11
--sub r1, r1, r0          ; mul24 r0, ra10, rb10
--sub r1, r1, r0          ; mul24 r0, ra9, rb9
--sub r1, r1, r0          ; mul24 r0, ra8, rb8
--sub r1, r1, r0          ; mul24 r0, ra15, rb15
--sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+nop                     ; mul24 r1, ra14, rb14
-+nop                     ; mul24 r0, ra13, rb13
-+add r1, r1, r0          ; mul24 r0, ra12, rb12
-+add r1, r1, r0          ; mul24 r0, ra11, rb11
-+add r1, r1, r0          ; mul24 r0, ra10, rb10
-+add r1, r1, r0          ; mul24 r0, ra9, rb9
-+add r1, r1, r0          ; mul24 r0, ra8, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb15
-+add.ifnn r1, r1, r0     ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- brr.anyn -, r:uvloop
--asr r1, r1, 15
-+asr r1, r1, 14
- min r1, r1, rb22
- max vpm, r1, 0
- 
--- 
-2.7.4
-
-
-From 61628063461ee5d891af6dbedfd495efcf464012 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 13 May 2015 13:54:11 +0100
-Subject: [PATCH 18/68] Fixed chroma inter prediction
-
----
- libavcodec/hevc.c          |    8 +-
- libavcodec/hevc.h          |    2 +-
- libavcodec/rpi_shader.c    | 1170 ++++++++++++++++++++++----------------------
- libavcodec/rpi_shader.h    |   22 +-
- libavcodec/rpi_shader.qasm |   24 +-
- 5 files changed, 617 insertions(+), 609 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index d5ea45e..d6d78ee 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -57,9 +57,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- #ifdef RPI_INTER_QPU
- 
- #define RPI_CHROMA_COMMAND_WORDS 12
-+#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
- 
-+
- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
- // TODO Chroma only needs 4 taps
-@@ -2024,7 +2026,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- 
-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                int chan = x0>>8;
- 
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-@@ -2730,6 +2733,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-+        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-     }
- 
-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-@@ -3689,7 +3693,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     // Also add space for the startup command for each stream.
- 
-     {
--        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-         uint32_t *p;
-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 2497c47..d513579 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -45,7 +45,7 @@
- 
-   #include "rpi_qpu.h"
-   // Use QPU for inter prediction
--  // #define RPI_INTER_QPU
-+  #define RPI_INTER_QPU
- 
- #endif
- 
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index d7ed297..831633b 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -33,7 +33,7 @@ unsigned int rpi_shader[] = {
- /* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
- /* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
- /* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-+/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
- /* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
- /* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
- /* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-@@ -152,7 +152,7 @@ unsigned int rpi_shader[] = {
- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
- /* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
- /* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-@@ -178,9 +178,9 @@ unsigned int rpi_shader[] = {
- /* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
--/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
- /* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
- /* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
- /* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-@@ -189,400 +189,400 @@ unsigned int rpi_shader[] = {
- /* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
- /* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
- /* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter
--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
--/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :loop
--/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
--/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
--/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // :fast_path
--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :fast_loop
--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
--/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
--/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
--/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
--/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
--/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
--/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
--/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
--/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
--/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
--/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
--/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
--/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
--/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
--/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
--/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_b
--/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
--/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :bloop
--/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
--/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
--/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
--/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_honly
--/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
--/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
--/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :loop_honly
--/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
--/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
--/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
--/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
--/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
--/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
--/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
--/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
- /* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_exit1
--/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit
--/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-@@ -592,225 +592,227 @@ unsigned int rpi_shader[] = {
- /* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit4
--/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_setup_uv
--/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
--/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
--/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
--/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
--/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
--/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
--/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
--/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
--/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
--/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
--/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
--/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
--/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
--/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
- /* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv_b
--/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
--/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index db971f4..3464cdb 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -5,16 +5,16 @@ extern unsigned int rpi_shader[];
- 
- #define mc_setup (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 146)
--#define mc_filter (rpi_shader + 360)
--#define mc_filter_b (rpi_shader + 670)
--#define mc_filter_honly (rpi_shader + 894)
--#define mc_exit (rpi_shader + 1048)
--#define mc_exit1 (rpi_shader + 1066)
--#define mc_interrupt_exit (rpi_shader + 1082)
--#define mc_interrupt_exit4 (rpi_shader + 1120)
--#define mc_interrupt_exit8 (rpi_shader + 1142)
--#define mc_setup_uv (rpi_shader + 1172)
--#define mc_filter_uv_b (rpi_shader + 1314)
--#define mc_end (rpi_shader + 1542)
-+#define mc_filter (rpi_shader + 364)
-+#define mc_filter_b (rpi_shader + 674)
-+#define mc_filter_honly (rpi_shader + 898)
-+#define mc_exit (rpi_shader + 1052)
-+#define mc_exit1 (rpi_shader + 1070)
-+#define mc_interrupt_exit (rpi_shader + 1086)
-+#define mc_interrupt_exit4 (rpi_shader + 1124)
-+#define mc_interrupt_exit8 (rpi_shader + 1146)
-+#define mc_setup_uv (rpi_shader + 1176)
-+#define mc_filter_uv_b (rpi_shader + 1318)
-+#define mc_end (rpi_shader + 1546)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 02fdcb2..4809e1d 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -21,7 +21,7 @@
- # rb19                                          next ra16
- #
- # ra20                                          1
--# ra21                                          64
-+# ra21                                          32
- # ra22                                          256
- # ra23                                          8
- #
-@@ -97,7 +97,7 @@ add rb24, r1, r0
- # load constants
- 
- mov ra20, 1
--mov ra21, 64
-+mov ra21, 32
- mov ra22, 256
- mov ra23, 8
- 
-@@ -270,7 +270,7 @@ add t0s, ra_x2_base, r2
- 
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
--mov r2, rb21         ; mul24 r2, r0, ra0
-+nop                  ; mul24 r2, r0, ra0
- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
- nop                  ; mul24      r3, ra1 << 1, r0 << 1
- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-@@ -301,9 +301,9 @@ sub.setf -, r3, 8 ; mov r1, ra22
- 
- # apply horizontal filter
- brr.anyn -, r:uvloop
--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 14          ; mov r1, ra21
--min.setf ra15, r0, rb22
-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+asr ra15, r0, 8         ; nop
-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
- 
- # apply vertical filter and write to VPM
- 
-@@ -315,12 +315,14 @@ add r1, r1, r0          ; mul24 r0, ra10, rb10
- add r1, r1, r0          ; mul24 r0, ra9, rb9
- add r1, r1, r0          ; mul24 r0, ra8, rb8
- add r1, r1, r0          ; mul24 r0, ra15, rb15
--add.ifnn r1, r1, r0     ; mov -, vw_wait
-+add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--brr.anyn -, r:uvloop
- asr r1, r1, 14
--min r1, r1, rb22
--max vpm, r1, 0
-+add r1, r1, ra21
-+brr.anyn -, r:uvloop
-+asr r1, r1, 6          # Delay 1
-+min r1, r1, rb22       # Delay 2
-+max vpm, r1, 0         # Delay 3
- 
- # DMA out for U
- 
-@@ -1161,7 +1163,7 @@ add rb24, r1, r0
- # load constants
- 
- mov ra20, 1
--mov ra21, 64
-+mov ra21, 32
- mov ra22, 256
- mov ra23, 8
- 
--- 
-2.7.4
-
-
-From b7321192751956ed7deceeb3dabe22ccedb8e08d Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 13 May 2015 14:37:32 +0100
-Subject: [PATCH 19/68] Removed unused luma functions
-
----
- libavcodec/hevc.c          |    4 +-
- libavcodec/rpi_qpu.c       |   32 +-
- libavcodec/rpi_shader.c    | 1097 +++++++++++++-------------------------------
- libavcodec/rpi_shader.h    |   19 +-
- libavcodec/rpi_shader.qasm |  970 +++------------------------------------
- 5 files changed, 396 insertions(+), 1726 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index d6d78ee..31b8b2f 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2731,8 +2731,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-         return;
-     for(k=0;k<8;k++) {
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-         assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-     }
- 
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 4d9eda8..4e90cc1 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -172,7 +172,7 @@ static int gpu_init(volatile struct GPU **gpu) {
- 
-   // Now copy over the QPU code into GPU memory
-   {
--    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
-     assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-   }
-@@ -612,24 +612,24 @@ unsigned int qpu_get_fn(int num) {
-       gpu_unlock();
-     }
-     switch(num) {
--    case QPU_MC_SETUP:
--      fn = mc_setup;
--      break;
--    case QPU_MC_FILTER:
--      fn = mc_filter;
--      break;
-+    //case QPU_MC_SETUP:
-+    //  fn = mc_setup;
-+    //  break;
-+    //case QPU_MC_FILTER:
-+    //  fn = mc_filter;
-+    //  break;
-     case QPU_MC_EXIT:
-       fn = mc_exit;
-       break;
--    case QPU_MC_INTERRUPT_EXIT:
--      fn = mc_interrupt_exit;
--      break;
--    case QPU_MC_FILTER_B:
--      fn = mc_filter_b;
--      break;
--    case QPU_MC_FILTER_HONLY:
--      fn = mc_filter_honly;
--      break;
-+    //case QPU_MC_INTERRUPT_EXIT:
-+    //  fn = mc_interrupt_exit;
-+    //  break;
-+    //case QPU_MC_FILTER_B:
-+    //  fn = mc_filter_b;
-+    //  break;
-+    //case QPU_MC_FILTER_HONLY:
-+    //  fn = mc_filter_honly;
-+    //  break;
-     case QPU_MC_SETUP_UV:
-       fn = mc_setup_uv;
-       break;
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 831633b..170e8ac 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -21,798 +21,331 @@ __declspec(align(8))
- __attribute__((aligned(8)))
- #endif
- unsigned int rpi_shader[] = {
--// ::mc_setup
-+// ::mc_setup_uv
- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
- /* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
- /* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
--/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
--/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
--/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
--/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
--/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
--/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
--/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
--/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
--/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
- /* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
- /* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
--/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv
--/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// ::mc_filter
--/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
--/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
--// :loop
--/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
--/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
--/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// :fast_path
--/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
--// :fast_loop
--/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
--/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
--/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
--/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
--/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
--/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
--/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
--/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
--/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
--/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
--/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
--/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
--/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
--/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
--/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
--/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// ::mc_filter_b
--/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
--/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
--// :bloop
--/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
--/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
--/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
--/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// ::mc_filter_honly
--/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
--/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
--/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
--/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
--/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
--/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
--/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
--/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
--/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
--/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
--/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
--// :loop_honly
--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
--/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
--/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
--/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
--/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
--/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
--/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
--/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
--/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// ::mc_exit
--/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_exit1
--/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_interrupt_exit
--/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_interrupt_exit4
--/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_interrupt_exit8
--/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_setup_uv
--/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
--/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
--/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
--/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
--/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
--/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
--/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
--/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
--/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
--/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
--/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
--/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
--/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
--/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
--/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
--/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
--/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_exit
-+/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit8
-+/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 3464cdb..9de4535 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -3,18 +3,11 @@
- 
- extern unsigned int rpi_shader[];
- 
--#define mc_setup (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 146)
--#define mc_filter (rpi_shader + 364)
--#define mc_filter_b (rpi_shader + 674)
--#define mc_filter_honly (rpi_shader + 898)
--#define mc_exit (rpi_shader + 1052)
--#define mc_exit1 (rpi_shader + 1070)
--#define mc_interrupt_exit (rpi_shader + 1086)
--#define mc_interrupt_exit4 (rpi_shader + 1124)
--#define mc_interrupt_exit8 (rpi_shader + 1146)
--#define mc_setup_uv (rpi_shader + 1176)
--#define mc_filter_uv_b (rpi_shader + 1318)
--#define mc_end (rpi_shader + 1546)
-+#define mc_setup_uv (rpi_shader + 0)
-+#define mc_filter_uv (rpi_shader + 142)
-+#define mc_filter_uv_b (rpi_shader + 360)
-+#define mc_exit (rpi_shader + 588)
-+#define mc_interrupt_exit8 (rpi_shader + 606)
-+#define mc_end (rpi_shader + 636)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 4809e1d..cd7346d 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -71,8 +71,10 @@
- 
- .set rb_const_64,                  rb21
- 
--# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
--::mc_setup
-+
-+################################################################################
-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+::mc_setup_uv
- 
- # Read starting kernel
- mov ra31, unif
-@@ -80,7 +82,9 @@ mov ra31, unif
- # Load first request location
- add ra_x_base, unif, elem_num # Store x
- mov ra_y, unif # Store y
--mov ra_x2_base, unif # Store frame base
-+mov ra_x2_base, unif # Store frame u base
-+nop
-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
- 
- # Read image dimensions
- sub rb25,unif,1
-@@ -143,29 +147,24 @@ mov r1, vpm_setup(0, 4, h8p(0, 0))
- add rb28, r0, r1
- 
- # Compute base address for first and second access
--#add r0, unif, elem_num     # x
- mov r0, ra_x_base           # Load x
--add r2, r0, 8               # x+8
- max r0, r0, 0; mov r1, ra_y # Load y
- min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
--shl ra_xshift_next, r0, 3
--max r2, r2, 0
-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
- add ra_y, r1, 1
--min r2, r2, rb_frame_width_minus_1
--shl ra_x2shift_next, r2, 3
--max r1, r1, 0  # y
--min r1, r1, rb_frame_height_minus_1
--add r0, r0, r3; mul24 r1, r1, rb_pitch
--add r2, r2, r3
-+add r0, r0, r3
- and r0, r0, ~3
--and r2, r2, ~3; mov ra_x_base, r0
-+max r1, r1, 0 ; mov ra_x_base, r0 # y
-+min r1, r1, rb_frame_height_minus_1
- # submit texture requests for first line
-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
- add t0s, r0, r1 ; mov ra_x2_base, r2
- add t0s, r2, r1
- 
- # Dump padding words
- mov r0, unif
- mov r0, unif
-+mov r0, unif
- 
- # submit texture requests for second line
- max r1, ra_y, 0
-@@ -176,6 +175,8 @@ nop ; mul24 r1, r1, rb_pitch
- add t0s, r1, ra_x_base
- add t0s, r1, ra_x2_base
- 
-+
-+
- ################################################################################
- 
- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-@@ -341,453 +342,26 @@ add vw_setup, rb26, r0 # VDW setup 0
- mov vw_setup, rb29 # Stride
- mov vw_addr, unif # start the VDW
- 
--################################################################################
--
--
--# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
--
--# At this point we have already issued two pairs of texture requests for the current block
--# ra_x_base, ra_x16_base point to the current coordinates for this block
--::mc_filter
--mov ra31, unif
--
--# per-channel shifts were calculated on the *previous* invocation
--
--mov ra_xshift, ra_xshift_next
--mov ra_x2shift, ra_x2shift_next
--
--# get base addresses and per-channel shifts for *next* invocation
--add r0, unif, elem_num    # x
--add r2, r0, 8 # x+8
--max r0, r0, 0; mov r1, unif # y
--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
--shl ra_xshift_next, r0, 3
--max r2, r2, 0
--min r2, r2, rb_frame_width_minus_1
--shl ra_x2shift_next, r2, 3
--add r0, r0, r3
--add r2, r2, r3
--and rb_x_base_next, r0, ~3
--and ra_x2_base_next, r2, ~3
--mov ra_y_next, r1
--
--# set up VPM write
--mov vw_setup, rb28
--
--# get width,height of block
--mov r2, 16
--mov r0, unif
--shr r1, r0, r2 # Extract width
--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
--and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
--shl r0, r0, 7
--add r0, r0, r1 # Combine width and height of destination area
--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
--add rb26, r0, rb27
--
--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
--
--# get filter coefficients
--
--mov r0, unif
--asr ra3, r0, rb23;      mul24 r0, r0, ra22
--asr ra2, r0, rb23;      mul24 r0, r0, ra22
--asr ra1, r0, rb23;      mul24 r0, r0, ra22
--asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
--asr rb11, r0, rb23;     mul24 r0, r0, ra22
--asr rb10, r0, rb23;     mul24 r0, r0, ra22
--asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
--asr rb15, r0, rb23;     mul24 r0, r0, ra22
--asr rb14, r0, rb23;     mul24 r0, r0, ra22
--asr rb13, r0, rb23;     mul24 r0, r0, ra22
--brr.anynn -, r:fast_path
--asr rb12, r0, rb23  # delay slot 1
--
--# r2 is elem_num
--# r3 is loop counter
--
--mov r5rep, -8 # delay slot 2
--
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
--
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--## nop                                                                 ; ldtmu0     # loop counter increment
--## shr r0, r4, ra17                                                    ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21         ; mul24 r3, r0, ra0
--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--## sub r2, r2, r3                                                    ; ldtmu0
--##
--## mov r0, ra22
--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # apply horizontal filter
--##
--## asr r2, r2, 15    ; mul24 r3, r0, ra0
--## min r2, r2, rb22
--## max ra13, r2, 0
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21
--## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
--## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
--## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
--## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
--## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
--## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
--## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
--## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
--## sub r0, r2, r3
--##
--## # apply horizontal filter
--##
--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
--## asr r0, r0, 15
--## min r0, r0, rb22
--## max ra14, r0, 0
--##
--##
--##
--##
--## nop                                                                 ; ldtmu0     # loop counter increment
--## shr r0, r4, ra17                                                    ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21         ; mul24 r3, r0, ra0
--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--## sub r0, r2, r3
--##
--## # apply horizontal filter
--##
--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
--## asr r0, r0, 15
--## min r0, r0, rb22
--## max ra15, r0, 0
--
--
--
--
--mov r3, 0
--
--:loop
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
--
--max r2, ra_y, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
--
--# generate seven shifted versions
--# interleave with scroll of vertical context
--
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--
--mov r2, rb21         ; mul24 r3, r0, ra0
--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--sub r0, r2, r3
--
--mov r3, rb31
--
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 8 ; mov r1, ra22
--
--# apply horizontal filter
--brr.anyn -, r:loop
--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 15          ; mov r1, ra21
--min.setf ra15, r0, rb22
--
--# apply vertical filter and write to VPM
--
--nop                     ; mul24 r0, ra14, rb14
--sub r1, r1, r0          ; mul24 r0, ra13, rb13
--sub r1, r1, r0          ; mul24 r0, ra12, rb12
--sub r1, r1, r0          ; mul24 r0, ra11, rb11
--sub r1, r1, r0          ; mul24 r0, ra10, rb10
--sub r1, r1, r0          ; mul24 r0, ra9, rb9
--sub r1, r1, r0          ; mul24 r0, ra8, rb8
--sub r1, r1, r0          ; mul24 r0, ra15, rb15
--sub.ifnn r1, r1, r0     ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--brr.anyn -, r:loop
--asr r1, r1, 15
--min r1, r1, rb22
--max vpm, r1, 0
--
--# DMA out
--
--bra -, ra31
--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
--mov vw_setup, rb29
--mov vw_addr, unif # start the VDW
--
--####################################################
--
--:fast_path
--## nop                                                                 ; ldtmu0     # loop counter increment
--## shr r0, r4, ra17                                                    ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21         ; mul24 r3, r0, ra0
--## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
--## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
--## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
--## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
--## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
--## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
--## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
--## sub r2, r2, r3                                                    ; ldtmu0
--##
--## mov r0, ra22
--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # apply horizontal filter
--##
--## asr r2, r2, 15    ; mul24 r3, r0, ra0
--## min r2, r2, rb22
--## max ra13, r2, 0
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21
--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
--## sub r0, r2, r3
--##
--## # apply horizontal filter
--##
--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
--## asr r0, r0, 15
--## min r0, r0, rb22
--## max ra14, r0, 0
--##
--##
--##
--##
--## nop                                                                 ; ldtmu0     # loop counter increment
--## shr r0, r4, ra17                                                    ; ldtmu0
--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
--## add ra16, ra16, rb16 ; mov t0s, ra16
--##
--## # generate seven shifted versions
--## # interleave with scroll of vertical context
--##
--## mov r2, rb21   ; mul24    r3, r0, ra0
--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
--## sub r0, r2, r3
--##
--## # apply horizontal filter
--##
--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
--## asr r0, r0, 15
--## min r0, r0, rb22
--## max ra15, r0, 0
--
--
--mov r3, 0  # This signifies the amount of unrolling
--
--:fast_loop
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_y, ra_y_next   ; mov rb31, r3
--mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
--
--max r2, ra_y, 0
--min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
--add t0s, ra_x2_base, r2
--
--# generate seven shifted versions
--# interleave with scroll of vertical context
--
--mov r2, rb21         ; mul24 r3, r0, ra0
--sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
--sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
--sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
--sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
--sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
--sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
--sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
--sub r0, r2, r3       ; mov r3, rb31
--
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 8       ; mov r1, ra22
--
--# apply horizontal filter
--
--brr.anyn -, r:fast_loop
--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 15          ; mov r1, ra21
--min.setf ra15, r0, rb22
--
--# apply vertical filter and write to VPM
--
--nop                     ; mul24 r0, ra14, rb14
--sub r1, r1, r0          ; mul24 r0, ra13, rb13
--sub r1, r1, r0          ; mul24 r0, ra12, rb12
--sub r1, r1, r0          ; mul24 r0, ra11, rb11
--sub r1, r1, r0          ; mul24 r0, ra10, rb10
--sub r1, r1, r0          ; mul24 r0, ra9, rb9
--sub r1, r1, r0          ; mul24 r0, ra8, rb8
--sub r1, r1, r0          ; mul24 r0, ra15, rb15
--sub.ifnn r1, r1, r0     ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--brr.anyn -, r:fast_loop
--asr r1, r1, 15
--min r1, r1, rb22
--max vpm, r1, 0
--
--# DMA out
--
--bra -, ra31
--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
--mov vw_setup, rb29
--mov vw_addr, unif # start the VDW
- 
- ################################################################################
- 
--# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
--
--# At this point we have already issued two pairs of texture requests for the current block
--# ra_x_base, ra_x16_base point to the current coordinates for this block
--::mc_filter_b
-+::mc_filter_uv_b
- mov ra31, unif
- 
- # per-channel shifts were calculated on the *previous* invocation
- 
- mov ra_xshift, ra_xshift_next
--mov ra_x2shift, ra_x2shift_next
- 
- # get base addresses and per-channel shifts for *next* invocation
- add r0, unif, elem_num    # x
--add r2, r0, 8 # x+8
- max r0, r0, 0; mov r1, unif # y
--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
- shl ra_xshift_next, r0, 3
--max r2, r2, 0
--min r2, r2, rb_frame_width_minus_1
--shl ra_x2shift_next, r2, 3
-+sub r2, unif, r3 # compute offset from frame base u to frame base v
- add r0, r0, r3
--add r2, r2, r3
- and rb_x_base_next, r0, ~3
--and ra_x2_base_next, r2, ~3
- mov ra_y_next, r1
-+add ra_x2_base_next, rb_x_base_next, r2
- 
- # set up VPM write
- mov vw_setup, rb28
-@@ -801,17 +375,22 @@ and r0, r0, rb22 # Extract height
- add rb17, r0, 5
- add rb18, r0, 7
- shl r0, r0, 7
-+
- # r0 is currently height<<7
- # For vr_setup we want height<<20 (so 20-7=13 additional bits)
- shl r3, r0, 13
- shl r3, r3, 8 # Mask off top 8 bits
- shr r3, r3, 8
-+
- add r0, r0, r1 # Combine width and height of destination area
- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
- add rb26, r0, rb27
-+
- # In a B frame, so also set up VPM read
- add vr_setup, r3, rb28
- 
-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+
- # get filter coefficients
- 
- mov r0, unif
-@@ -837,9 +416,13 @@ asr rb12, r0, rb23
- 
- mov r5rep, -8
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
- mov r3, 0
- 
--:bloop
-+:uvloop_b
- # retrieve texture results and pick out bytes
- # then submit two more texture requests
- 
-@@ -847,7 +430,7 @@ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
- shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
- mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
- 
- max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
-@@ -861,6 +444,7 @@ add t0s, ra_x2_base, r2
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
- mov r2, rb21         ; mul24 r3, r0, ra0
-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
- sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
- sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-@@ -889,7 +473,7 @@ mov ra13, ra14
- sub.setf -, r3, 8 ; mov r1, ra22
- 
- # apply horizontal filter
--brr.anyn -, r:bloop
-+brr.anyn -, r:uvloop_b
- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
- asr r0, r0, 15          ; mov r1, ra21
- min.setf ra15, r0, rb22
-@@ -906,213 +490,50 @@ sub r1, r1, r0          ; mul24 r0, ra8, rb8
- sub r1, r1, r0          ; mul24 r0, ra15, rb15
- sub.ifnn r1, r1, r0     ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 15          ; mov -, vr_wait
-+asr r1, r1, 15
- min r1, r1, rb22
- add r0, vpm, 1          # Blend in previous VPM contents at this location
--brr.anyn -, r:bloop
-+brr.anyn -, r:uvloop_b
- max r1, r1, 0
- add r1, r1, r0
- shr vpm, r1, 1
- 
--# DMA out
-+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
- 
- bra -, ra31
--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
--mov vw_setup, rb29
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
- mov vw_addr, unif # start the VDW
- 
- ################################################################################
- 
--# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
--# This filter only does horizontal filtering.
--# It is assumed that the region to fetch does not include extra rows above.
-+# mc_exit()
- 
--# At this point we have already issued two pairs of texture requests for the current block
--# ra_x_base, ra_x16_base point to the current coordinates for this block
--::mc_filter_honly
--mov ra31, unif
-+::mc_exit
-+mov  -, vw_wait # wait on the VDW
- 
--# per-channel shifts were calculated on the *previous* invocation
-+mov -,srel(0)
- 
--mov ra_xshift, ra_xshift_next
--mov ra_x2shift, ra_x2shift_next
--
--# get base addresses and per-channel shifts for *next* invocation
--add r0, unif, elem_num    # x
--add r2, r0, 8 # x+8
--max r0, r0, 0; mov r1, unif # y
--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
--shl ra_xshift_next, r0, 3
--max r2, r2, 0
--min r2, r2, rb_frame_width_minus_1
--shl ra_x2shift_next, r2, 3
--add r0, r0, r3
--add r2, r2, r3
--and rb_x_base_next, r0, ~3
--and ra_x2_base_next, r2, ~3
--mov ra_y_next, r1
--
--# set up VPM write
--mov vw_setup, rb28
--
--# get width,height of block
--mov r2, 16
--mov r0, unif
--shr r1, r0, r2 # Extract width
--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
--and r0, r0, rb22 # Extract height
--add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
--shl r0, r0, 7 ; mov rb18,r0
--add r0, r0, r1 # Combine width and height of destination area
--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
--add rb26, r0, rb27
--
--# get filter coefficients
--
--mov r0, unif
--asr ra3, r0, rb23;      mul24 r0, r0, ra22
--asr ra2, r0, rb23;      mul24 r0, r0, ra22
--asr ra1, r0, rb23;      mul24 r0, r0, ra22
--asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
--mov r0, unif
--
--# r2 is elem_num
--# r3 is loop counter
--mov r5rep, -8
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
--mov r3, 0
--
--:loop_honly
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
--
--max r2, ra_y, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
--
--# generate seven shifted versions
--# interleave with scroll of vertical context
--
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--
--mov r2, rb21         ; mul24 r3, r0, ra0
--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--sub r0, r2, r3       ; mov r3, rb31
--
--sub.setf -, r3, rb18 ; mov r1, ra22
--
--mov -, vw_wait   ; mul24 r0, r0, r1
--brr.anyn -, r:loop_honly
--asr r0, r0, 15          # delay 1
--min r0, r0, rb22        # delay 2
--max vpm, r0, 0          # delay 3
--
--# DMA out
--bra -, ra31
--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
--mov vw_setup, rb29
--mov vw_addr, unif # start the VDW
--
--
--################################################################################
--
--# mc_exit()
--
--::mc_exit
--mov  -, vw_wait # wait on the VDW
--
--mov -,srel(0)
--
--ldtmu0
--ldtmu0
--ldtmu0
--ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
- 
- nop        ; nop ; thrend
- nop        ; nop # delay slot 1
- nop        ; nop # delay slot 2
- 
--::mc_exit1
--mov  -, vw_wait # wait on the VDW
--
--#mov -,srel(1)
--
--ldtmu0
--ldtmu0
--ldtmu0
--ldtmu0
--
--nop        ; nop ; thrend
--mov interrupt, 1; nop # delay slot 1
--nop        ; nop # delay slot 2
--
--# mc_interrupt_exit()
--::mc_interrupt_exit
--mov  -, vw_wait # wait on the VDW
--
--ldtmu0
--ldtmu0
--ldtmu0
--ldtmu0
--
--mov -,sacq(0) # 1
--mov -,sacq(0) # 2
--mov -,sacq(0) # 3
--mov -,sacq(0) # 4
--mov -,sacq(0) # 5
--mov -,sacq(0) # 6
--mov -,sacq(0) # 7
--mov -,sacq(0) # 8
--mov -,sacq(0) # 9
--mov -,sacq(0) # 10
--mov -,sacq(0) # 11
--
--nop        ; nop ; thrend
--mov interrupt, 1; nop # delay slot 1
--nop        ; nop # delay slot 2
--
--# mc_interrupt_exit4()
--::mc_interrupt_exit4
--mov  -, vw_wait # wait on the VDW
--
--ldtmu0
--ldtmu0
--ldtmu0
--ldtmu0
--
--mov -,sacq(0) # 1
--mov -,sacq(0) # 2
--mov -,sacq(0) # 3
--
--nop        ; nop ; thrend
--mov interrupt, 1; nop # delay slot 1
--nop        ; nop # delay slot 2
--
- # mc_interrupt_exit8()
- ::mc_interrupt_exit8
- mov  -, vw_wait # wait on the VDW
-@@ -1134,282 +555,5 @@ nop        ; nop ; thrend
- mov interrupt, 1; nop # delay slot 1
- nop        ; nop # delay slot 2
- 
--################################################################################
--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
--::mc_setup_uv
--
--# Read starting kernel
--mov ra31, unif
--
--# Load first request location
--add ra_x_base, unif, elem_num # Store x
--mov ra_y, unif # Store y
--mov ra_x2_base, unif # Store frame u base
--nop
--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
--
--# Read image dimensions
--sub rb25,unif,1
--sub rb30,unif,1
--
--# get source pitch
--mov rb16, unif
--
--# get destination pitch
--mov r0, unif
--mov r1, vdw_setup_1(0)
--add rb24, r1, r0
--
--# load constants
--
--mov ra20, 1
--mov ra21, 32
--mov ra22, 256
--mov ra23, 8
--
--mov rb20, 0xffffff00
--mov rb21, 64
--mov rb22, 255
--mov rb23, 24
--
--# touch vertical context to keep simulator happy
--
--mov ra8, 0
--mov ra9, 0
--mov ra10, 0
--mov ra11, 0
--mov ra12, 0
--mov ra13, 0
--mov ra14, 0
--mov ra15, 0
--
--# Compute part of VPM to use for DMA output
--mov r2, qpu_num
--and r2, r2, 15
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
--shl r0, r0, 5
--add rb27, r0, r1
--
--# Compute part of VPM to save data into
--mov r2, qpu_num
--and r2, r2, 15
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vpm_setup(0, 4, h8p(0, 0))
--add rb28, r0, r1
--
--# Compute base address for first and second access
--mov r0, ra_x_base           # Load x
--max r0, r0, 0; mov r1, ra_y # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
--shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--add ra_y, r1, 1
--add r0, r0, r3
--and r0, r0, ~3
--max r1, r1, 0 ; mov ra_x_base, r0 # y
--min r1, r1, rb_frame_height_minus_1
--# submit texture requests for first line
--add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--add t0s, r0, r1 ; mov ra_x2_base, r2
--add t0s, r2, r1
--
--# Dump padding words
--mov r0, unif
--mov r0, unif
--mov r0, unif
--
--# submit texture requests for second line
--max r1, ra_y, 0
--min r1, r1, rb_frame_height_minus_1
--add ra_y, ra_y, 1
--bra -, ra31
--nop ; mul24 r1, r1, rb_pitch
--add t0s, r1, ra_x_base
--add t0s, r1, ra_x2_base
--
--
--
--################################################################################
--
--::mc_filter_uv_b
--mov ra31, unif
--
--# per-channel shifts were calculated on the *previous* invocation
--
--mov ra_xshift, ra_xshift_next
--
--# get base addresses and per-channel shifts for *next* invocation
--add r0, unif, elem_num    # x
--max r0, r0, 0; mov r1, unif # y
--min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
--shl ra_xshift_next, r0, 3
--sub r2, unif, r3 # compute offset from frame base u to frame base v
--add r0, r0, r3
--and rb_x_base_next, r0, ~3
--mov ra_y_next, r1
--add ra_x2_base_next, rb_x_base_next, r2
--
--# set up VPM write
--mov vw_setup, rb28
--
--# get width,height of block
--mov r2, 16
--mov r0, unif
--shr r1, r0, r2 # Extract width
--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
--and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
--shl r0, r0, 7
--
--# r0 is currently height<<7
--# For vr_setup we want height<<20 (so 20-7=13 additional bits)
--shl r3, r0, 13
--shl r3, r3, 8 # Mask off top 8 bits
--shr r3, r3, 8
--
--add r0, r0, r1 # Combine width and height of destination area
--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
--add rb26, r0, rb27
--
--# In a B frame, so also set up VPM read
--add vr_setup, r3, rb28
--
--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
--
--# get filter coefficients
--
--mov r0, unif
--asr ra3, r0, rb23;      mul24 r0, r0, ra22
--asr ra2, r0, rb23;      mul24 r0, r0, ra22
--asr ra1, r0, rb23;      mul24 r0, r0, ra22
--asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
--asr rb11, r0, rb23;     mul24 r0, r0, ra22
--asr rb10, r0, rb23;     mul24 r0, r0, ra22
--asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
--asr rb15, r0, rb23;     mul24 r0, r0, ra22
--asr rb14, r0, rb23;     mul24 r0, r0, ra22
--asr rb13, r0, rb23;     mul24 r0, r0, ra22
--asr rb12, r0, rb23
--
--# r2 is elem_num
--# r3 is loop counter
--
--mov r5rep, -8
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--mov r3, 0
--
--:uvloop_b
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
--
--max r2, ra_y, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
--
--# generate seven shifted versions
--# interleave with scroll of vertical context
--
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--
--mov r2, rb21         ; mul24 r3, r0, ra0
--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--sub r0, r2, r3
--
--mov r3, rb31
--
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 8 ; mov r1, ra22
--
--# apply horizontal filter
--brr.anyn -, r:uvloop_b
--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 15          ; mov r1, ra21
--min.setf ra15, r0, rb22
--
--# apply vertical filter and write to VPM
--
--nop                     ; mul24 r0, ra14, rb14
--sub r1, r1, r0          ; mul24 r0, ra13, rb13
--sub r1, r1, r0          ; mul24 r0, ra12, rb12
--sub r1, r1, r0          ; mul24 r0, ra11, rb11
--sub r1, r1, r0          ; mul24 r0, ra10, rb10
--sub r1, r1, r0          ; mul24 r0, ra9, rb9
--sub r1, r1, r0          ; mul24 r0, ra8, rb8
--sub r1, r1, r0          ; mul24 r0, ra15, rb15
--sub.ifnn r1, r1, r0     ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 15
--min r1, r1, rb22
--add r0, vpm, 1          # Blend in previous VPM contents at this location
--brr.anyn -, r:uvloop_b
--max r1, r1, 0
--add r1, r1, r0
--shr vpm, r1, 1
--
--
--# DMA out for U
--
--mov vw_setup, rb26 # VDW setup 0
--mov vw_setup, rb29 # Stride
--mov vw_addr, unif # start the VDW
--
--# DMA out for V
--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
--# Could potentially push this write into the start of the next pipeline stage.
--mov r0, 16
--mov -, vw_wait
--
--bra -, ra31
--add vw_setup, rb26, r0 # VDW setup 0
--mov vw_setup, rb29 # Stride
--mov vw_addr, unif # start the VDW
--
- ::mc_end
-+# Do not add code here because mc_end must appear after all other code.
--- 
-2.7.4
-
-
-From d40d59de0f09fd1a6e7146532418b63d8e2711b7 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 13 May 2015 14:54:25 +0100
-Subject: [PATCH 20/68] Moved chroma P1 to QPUs
-
----
- libavcodec/hevc.c | 38 ++++++++++++++++++++++++++++++++++++++
- 1 file changed, 38 insertions(+)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 31b8b2f..391d139 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2070,6 +2070,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
- 
-         if (s->ps.sps->chroma_format_idc) {
-+#ifdef RPI_INTER_QPU
-+            if (s->enable_rpi) {
-+                int reflist = 1;
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[reflist];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                int chan = x0>>8;
-+
-+                uint32_t *u = s->u_mvs[chan & 7];
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_mx][1];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = rpi_filter_coefs[_my][1];
-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->u_mvs[chan & 7] = u;
-+                return;
-+            }
-+#endif
-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
--- 
-2.7.4
-
-
-From 75777ba7927086e862104b14f6446e81bc789611 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 13 May 2015 15:13:47 +0100
-Subject: [PATCH 21/68] Added B prediction - not quite right
-
----
- libavcodec/hevc.c          |  58 ++++++++++++++++++++++++
- libavcodec/rpi_shader.c    | 108 +++++++++++++++++++++++----------------------
- libavcodec/rpi_shader.h    |   6 +--
- libavcodec/rpi_shader.qasm |  48 ++++++++++----------
- 4 files changed, 141 insertions(+), 79 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 391d139..47ddfff 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2127,6 +2127,64 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                    ref1->frame, &current_mv.mv[1], &current_mv);
- 
-         if (s->ps.sps->chroma_format_idc) {
-+#ifdef RPI_INTER_QPU
-+            if (s->enable_rpi) {
-+                int hshift           = s->ps.sps->hshift[1];
-+                int vshift           = s->ps.sps->vshift[1];
-+                const Mv *mv         = &current_mv.mv[0];
-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-+                intptr_t _mx         = mx << (1 - hshift);
-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
-+
-+                const Mv *mv2         = &current_mv.mv[1];
-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
-+                intptr_t _mx2         = mx2 << (1 - hshift);
-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
-+
-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
-+
-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+
-+                uint32_t *u = s->u_mvs[chan & 7];
-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      *u++ = rpi_filter_coefs[_mx][0];
-+                      *u++ = rpi_filter_coefs[_mx][1];
-+                      *u++ = rpi_filter_coefs[_my][0];
-+                      *u++ = rpi_filter_coefs[_my][1];
-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                      *u++ = rpi_filter_coefs[_mx2][0];
-+                      *u++ = rpi_filter_coefs[_mx2][1];
-+                      *u++ = rpi_filter_coefs[_my2][0];
-+                      *u++ = rpi_filter_coefs[_my2][1];
-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                    }
-+                }
-+                s->u_mvs[chan & 7] = u;
-+                return;
-+            }
-+#endif
-             RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
- 
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 170e8ac..5d00cb2 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -265,23 +265,23 @@ unsigned int rpi_shader[] = {
- /* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
- /* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
- /* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
--/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
- /* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- /* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- /* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
- /* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
- /* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
- /* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
- /* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
- /* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
- /* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
- /* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-@@ -291,61 +291,63 @@ unsigned int rpi_shader[] = {
- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
- /* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
--/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
--/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
--/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
--/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
--/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
--/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
- /* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 9de4535..e36c4ae 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -6,8 +6,8 @@ extern unsigned int rpi_shader[];
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 142)
- #define mc_filter_uv_b (rpi_shader + 360)
--#define mc_exit (rpi_shader + 588)
--#define mc_interrupt_exit8 (rpi_shader + 606)
--#define mc_end (rpi_shader + 636)
-+#define mc_exit (rpi_shader + 592)
-+#define mc_interrupt_exit8 (rpi_shader + 610)
-+#define mc_end (rpi_shader + 640)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index cd7346d..870437d2 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -443,23 +443,23 @@ add t0s, ra_x2_base, r2
- 
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
--mov r2, rb21         ; mul24 r3, r0, ra0
--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24 r2, r0, ra0
-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--sub r0, r2, r3
-+add r0, r2, r3
- 
- mov r3, rb31
- 
-@@ -474,23 +474,25 @@ sub.setf -, r3, 8 ; mov r1, ra22
- 
- # apply horizontal filter
- brr.anyn -, r:uvloop_b
--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr r0, r0, 15          ; mov r1, ra21
--min.setf ra15, r0, rb22
-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+asr ra15, r0, 8         ; nop
-+nop                     ; nop
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r0, ra14, rb14
--sub r1, r1, r0          ; mul24 r0, ra13, rb13
--sub r1, r1, r0          ; mul24 r0, ra12, rb12
--sub r1, r1, r0          ; mul24 r0, ra11, rb11
--sub r1, r1, r0          ; mul24 r0, ra10, rb10
--sub r1, r1, r0          ; mul24 r0, ra9, rb9
--sub r1, r1, r0          ; mul24 r0, ra8, rb8
--sub r1, r1, r0          ; mul24 r0, ra15, rb15
--sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+nop                     ; mul24 r1, ra14, rb14
-+nop                     ; mul24 r0, ra13, rb13
-+add r1, r1, r0          ; mul24 r0, ra12, rb12
-+add r1, r1, r0          ; mul24 r0, ra11, rb11
-+add r1, r1, r0          ; mul24 r0, ra10, rb10
-+add r1, r1, r0          ; mul24 r0, ra9, rb9
-+add r1, r1, r0          ; mul24 r0, ra8, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb15
-+add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 15
-+asr r1, r1, 14
-+add r1, r1, ra21
-+asr r1, r1, 6
- min r1, r1, rb22
- add r0, vpm, 1          # Blend in previous VPM contents at this location
- brr.anyn -, r:uvloop_b
--- 
-2.7.4
-
-
-From 3d4e94b8f0b08fe4c0b582fc7f1dbe9d1d9d60ed Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 08:15:55 +0100
-Subject: [PATCH 22/68] Added flush for SAO
-
----
- libavcodec/hevc.c        |  2 +-
- libavcodec/hevc_filter.c | 39 ++++++++++++++++++++++++++-------------
- 2 files changed, 27 insertions(+), 14 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 47ddfff..93e1eba 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2903,7 +2903,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-             rpi_execute_inter_qpu(s);
- #endif
-             // Transform all blocks
--            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-             rpi_execute_transform(s);
-             // Perform inter prediction
-             rpi_execute_inter_cmds(s);
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 9b6e26d..92a8271 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -871,6 +871,21 @@ static void flush_buffer(AVBufferRef *bref) {
-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-     gpu_cache_flush(p);
- }
-+
-+static void ff_hevc_flush_chroma(HEVCContext *s)
-+{
-+    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+            s->nal_unit_type == NAL_TSA_N   ||
-+            s->nal_unit_type == NAL_STSA_N  ||
-+            s->nal_unit_type == NAL_RADL_N  ||
-+            s->nal_unit_type == NAL_RASL_N )) {
-+        flush_buffer(s->frame->buf[1]);
-+        flush_buffer(s->frame->buf[2]);
-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+    }
-+}
- #endif
- 
- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-@@ -886,31 +901,29 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             sao_filter_CTB(s, x - ctb_size, y);
-         if (y && x_end) {
-             sao_filter_CTB(s, x, y - ctb_size);
--            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_chroma(s);
-+#endif
-                 ff_thread_report_progress(&s->ref->tf, y, 0);
-+            }
-         }
-         if (x_end && y_end) {
-             sao_filter_CTB(s, x , y);
--            if (s->threads_type & FF_THREAD_FRAME )
-+            if (s->threads_type & FF_THREAD_FRAME ) {
-+#ifdef RPI_INTER_QPU
-+                ff_hevc_flush_chroma(s);
-+#endif
-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+            }
-         }
-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-         //int newh = y + ctb_size - 4;
-         //int currh = s->ref->tf.progress->data[0];
-         //if (((y + ctb_size)&63)==0)
--        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
--            s->nal_unit_type == NAL_TSA_N   ||
--            s->nal_unit_type == NAL_STSA_N  ||
--            s->nal_unit_type == NAL_RADL_N  ||
--            s->nal_unit_type == NAL_RASL_N )) {
- #ifdef RPI_INTER_QPU
--            flush_buffer(s->frame->buf[1]);
--            flush_buffer(s->frame->buf[2]);
-+        ff_hevc_flush_chroma(s);
- #endif
--            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
--            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
--            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
--        }
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-     }
- }
--- 
-2.7.4
-
-
-From 3e337b9c4ef0c356a0259be2254ad1bc4d5bbe29 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 09:17:28 +0100
-Subject: [PATCH 23/68] Stopped using acceleration in unsupported cases
-
----
- libavcodec/hevc.c       | 14 +++++++-------
- libavcodec/hevc_cabac.c |  4 ++--
- 2 files changed, 9 insertions(+), 9 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 93e1eba..bfd5a55 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -1152,15 +1152,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                         for (i = 0; i < (size * size); i++) {
-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-                         }
--                        printf("Cross component not supported\n"); // TODO
--                        exit(-1);
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-                     }
-             }
- 
-             if (lc->tu.cross_pf) {
--                printf("Cross component not supported\n"); // TODO
--                exit(-1);
-                 hls_cross_component_pred(s, 1);
-             }
-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-@@ -1189,8 +1185,6 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-                         for (i = 0; i < (size * size); i++) {
-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-                         }
--                        printf("Cross component not supported\n"); // TODO
--                        exit(-1);
-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-                     }
-             }
-@@ -2857,7 +2851,13 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
- #ifdef RPI
--    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+    s->enable_rpi = s->ps.sps->bit_depth == 8
-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-+                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+                    && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-+
- #endif
- 
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 4f072be..38f53de 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1513,9 +1513,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
- #ifdef RPI
-             if (!use_vpu) {
-               int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
--              if (max_xy == 0)
-+              if (max_xy == 0) {
-                   s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
--              else {
-+              } else {
-                   int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-                   if (max_xy < 4)
-                       col_limit = FFMIN(4, col_limit);
--- 
-2.7.4
-
-
-From 3941d3e4c2305fa037e8aba5a14cf698ac8673db Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 09:42:16 +0100
-Subject: [PATCH 24/68] Split B prediction into two passes
-
----
- libavcodec/hevc.c          |   1 +
- libavcodec/hevc.h          |   1 +
- libavcodec/rpi_qpu.c       |   3 +
- libavcodec/rpi_qpu.h       |   1 +
- libavcodec/rpi_shader.c    | 559 +++++++++++++++++++++++++++------------------
- libavcodec/rpi_shader.h    |  11 +-
- libavcodec/rpi_shader.qasm | 196 ++++++++++++++--
- 7 files changed, 531 insertions(+), 241 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index bfd5a55..4b133d2 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3801,6 +3801,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-             p += uv_commands_per_qpu;
-         }
-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-         s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
- 
-     }
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index d513579..4a39e39 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -917,6 +917,7 @@ typedef struct HEVCContext {
-     uint32_t *u_mvs[8];
-     // Function pointers
-     uint32_t mc_filter_uv;
-+    uint32_t mc_filter_uv_b0;
-     uint32_t mc_filter_uv_b;
- #endif
- 
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 4e90cc1..60bf079 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -636,6 +636,9 @@ unsigned int qpu_get_fn(int num) {
-     case QPU_MC_FILTER_UV:
-       fn = mc_filter_uv;
-       break;
-+    case QPU_MC_FILTER_UV_B0:
-+      fn = mc_filter_uv_b0;
-+      break;
-     case QPU_MC_FILTER_UV_B:
-       fn = mc_filter_uv_b;
-       break;
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index f9ad333..543c84b 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -29,6 +29,7 @@ enum {
-   QPU_MC_FILTER_HONLY,
-   QPU_MC_SETUP_UV,
-   QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B0,
-   QPU_MC_FILTER_UV_B,
-   QPU_MC_INTERRUPT_EXIT8,
-   QPU_MC_END
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 5d00cb2..88ad20b 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -39,18 +39,18 @@ unsigned int rpi_shader[] = {
- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
- /* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
- /* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
- /* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
- /* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-@@ -62,176 +62,176 @@ unsigned int rpi_shader[] = {
- /* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
- /* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
- /* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
--/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv
--/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--// ::mc_filter_uv_b
--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter_uv_b0
-+/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
- /* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
- /* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
- /* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-@@ -253,7 +253,7 @@ unsigned int rpi_shader[] = {
- /* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
- /* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- /* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
--// :uvloop_b
-+// :uvloop_b0
- /* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
- /* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
- /* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-@@ -290,7 +290,7 @@ unsigned int rpi_shader[] = {
- /* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
- /* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
- /* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
- /* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-@@ -306,48 +306,163 @@ unsigned int rpi_shader[] = {
- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- /* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
- /* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter_uv_b
-+/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :uvloop_b
-+/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index e36c4ae..809e582 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,10 +4,11 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 142)
--#define mc_filter_uv_b (rpi_shader + 360)
--#define mc_exit (rpi_shader + 592)
--#define mc_interrupt_exit8 (rpi_shader + 610)
--#define mc_end (rpi_shader + 640)
-+#define mc_filter_uv (rpi_shader + 150)
-+#define mc_filter_uv_b0 (rpi_shader + 368)
-+#define mc_filter_uv_b (rpi_shader + 586)
-+#define mc_exit (rpi_shader + 818)
-+#define mc_interrupt_exit8 (rpi_shader + 836)
-+#define mc_end (rpi_shader + 866)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 870437d2..635b894 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -26,7 +26,7 @@
- # ra23                                          8
- #
- # rb20                                          0xffffff00
--# rb21                                          64
-+# rb21                                          vpm_setup for writing 16bit results into VPM
- # rb22                                          255
- # rb23                                          24
- #
-@@ -34,7 +34,7 @@
- # rb25                                          frame width-1
- # rb26                                          height<<23 + width<<16 + vdw_setup_0
- # rb27                                          vdw_setup_0 (depends on QPU number)
--# rb28                                          vpm_setup (depends on QPU number)
-+# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
- # rb29                                          vdw_setup_1(dst_pitch-width)
- # rb30                                          frame height-1
- # rb31                                          used as temp to count loop iterations
-@@ -69,8 +69,6 @@
- .set ra_y_next,                    ra28
- .set ra_y,                         ra29
- 
--.set rb_const_64,                  rb21
--
- 
- ################################################################################
- # mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-@@ -106,7 +104,6 @@ mov ra22, 256
- mov ra23, 8
- 
- mov rb20, 0xffffff00
--mov rb21, 64
- mov rb22, 255
- mov rb23, 24
- 
-@@ -123,6 +120,7 @@ mov ra15, 0
- 
- # Compute part of VPM to use for DMA output
- mov r2, qpu_num
-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
- and r2, r2, 15
- mov r1, r2
- asr r1, r1, 2
-@@ -135,16 +133,21 @@ shl r0, r0, 5
- add rb27, r0, r1
- 
- # Compute part of VPM to save data into
--mov r2, qpu_num
--and r2, r2, 15
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vpm_setup(0, 4, h8p(0, 0))
-+mov r2, qpu_num   # qpu_num = abcd
-+shl r2, r2, 1
-+and r2, r2, 15    # r2 = bcd0
-+mov r1, r2        # r1 = bcd0
-+asr r1, r1, 2     # r1 = bc
-+shl r1, r1, 6     # r1 = bc000000
-+mov r0, r2        # r0 = bcd0
-+and r0, r0, 3     # r0 = d0
-+add r0, r0, r1    # r0 = bc0000d0
-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
- add rb28, r0, r1
-+asr r0, r0, 1     # r0 = bc0000d
-+# Prepare VPM command for 16bit intermediates
-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+add rb21, r0, r1
- 
- # Compute base address for first and second access
- mov r0, ra_x_base           # Load x
-@@ -345,6 +348,171 @@ mov vw_addr, unif # start the VDW
- 
- ################################################################################
- 
-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x_base, ra_x16_base point to the current coordinates for this block
-+::mc_filter_uv_b0
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num    # x
-+max r0, r0, 0; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+shl ra_xshift_next, r0, 3
-+sub r2, unif, r3 # compute offset from frame base u to frame base v
-+add r0, r0, r3
-+and rb_x_base_next, r0, ~3
-+mov ra_y_next, r1
-+add ra_x2_base_next, rb_x_base_next, r2
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+
-+# get filter coefficients
-+
-+mov r0, unif
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+asr rb12, r0, rb23
-+
-+# r2 is elem_num
-+# r3 is loop counter
-+
-+mov r5rep, -8
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+mov r3, 0
-+
-+:uvloop_b0
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_x2_base, r2
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+nop                  ; mul24 r2, r0, ra0
-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+add r0, r2, r3
-+
-+mov r3, rb31
-+
-+mov ra8, ra9
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+mov ra13, ra14
-+
-+sub.setf -, r3, 8 ; mov r1, ra22
-+
-+# apply horizontal filter
-+brr.anyn -, r:uvloop_b0
-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+asr ra15, r0, 8         ; nop
-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r1, ra14, rb14
-+nop                     ; mul24 r0, ra13, rb13
-+add r1, r1, r0          ; mul24 r0, ra12, rb12
-+add r1, r1, r0          ; mul24 r0, ra11, rb11
-+add r1, r1, r0          ; mul24 r0, ra10, rb10
-+add r1, r1, r0          ; mul24 r0, ra9, rb9
-+add r1, r1, r0          ; mul24 r0, ra8, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb15
-+add r1, r1, r0          ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+asr r1, r1, 14
-+add r1, r1, ra21
-+brr.anyn -, r:uvloop
-+asr r1, r1, 6          # Delay 1
-+min r1, r1, rb22       # Delay 2
-+max vpm, r1, 0         # Delay 3
-+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+################################################################################
-+
- ::mc_filter_uv_b
- mov ra31, unif
- 
--- 
-2.7.4
-
-
-From 85d0ffa2bcf6a2b94c1a0c8f84241cda9ac92ce2 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 10:04:55 +0100
-Subject: [PATCH 25/68] Switch to using 16bit temp buffers
-
----
- libavcodec/hevc.c          |  2 +-
- libavcodec/rpi_shader.c    |  4 ++--
- libavcodec/rpi_shader.qasm | 10 +++++-----
- 3 files changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 4b133d2..28a6660 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2147,7 +2147,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 88ad20b..ffd3a07 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -220,7 +220,7 @@ unsigned int rpi_shader[] = {
- /* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
- /* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
- /* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
- /* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
- /* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
- /* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-@@ -346,7 +346,7 @@ unsigned int rpi_shader[] = {
- /* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
- /* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
- /* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
- /* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
- /* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
- /* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 635b894..9577121 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -26,7 +26,7 @@
- # ra23                                          8
- #
- # rb20                                          0xffffff00
--# rb21                                          vpm_setup for writing 16bit results into VPM
-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
- # rb22                                          255
- # rb23                                          24
- #
-@@ -370,8 +370,8 @@ and rb_x_base_next, r0, ~3
- mov ra_y_next, r1
- add ra_x2_base_next, rb_x_base_next, r2
- 
--# set up VPM write
--mov vw_setup, rb28
-+# set up VPM write, we need to save 16bit precision
-+mov vw_setup, rb21
- 
- # get width,height of block
- mov r2, 16
-@@ -554,8 +554,8 @@ add r0, r0, r1 # Combine width and height of destination area
- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
- add rb26, r0, rb27
- 
--# In a B frame, so also set up VPM read
--add vr_setup, r3, rb28
-+# In a B frame, so also set up VPM read (reading back 16bit precision)
-+add vr_setup, r3, rb21
- 
- sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
- 
--- 
-2.7.4
-
-
-From abc51bf61df597082fbd7cf1bba5031e4d44318b Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 10:30:44 +0100
-Subject: [PATCH 26/68] Corrected B prediction: matching md5 sum for hobbit50
-
----
- libavcodec/rpi_shader.c    | 815 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |  12 +-
- libavcodec/rpi_shader.qasm |  36 +-
- 3 files changed, 429 insertions(+), 434 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index ffd3a07..77cca46 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -38,431 +38,428 @@ unsigned int rpi_shader[] = {
- /* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
--/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
--/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
--/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
--/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
--/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
- /* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv
--/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
--/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
--/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 809e582..6562fa9 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,11 +4,11 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 150)
--#define mc_filter_uv_b0 (rpi_shader + 368)
--#define mc_filter_uv_b (rpi_shader + 586)
--#define mc_exit (rpi_shader + 818)
--#define mc_interrupt_exit8 (rpi_shader + 836)
--#define mc_end (rpi_shader + 866)
-+#define mc_filter_uv (rpi_shader + 152)
-+#define mc_filter_uv_b0 (rpi_shader + 370)
-+#define mc_filter_uv_b (rpi_shader + 584)
-+#define mc_exit (rpi_shader + 812)
-+#define mc_interrupt_exit8 (rpi_shader + 830)
-+#define mc_end (rpi_shader + 860)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 9577121..562dc35 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -39,13 +39,13 @@
- # rb30                                          frame height-1
- # rb31                                          used as temp to count loop iterations
- #
--# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
- # ra24                                          clipped(row start address+8+elem_num)&~3
- # ra25                                          per-channel shifts 2
- # ra26                                          next ra24
- # ra27                                          next ra25
- # ra28                                          next y
- # ra29                                          y for next texture access
-+# ra30                                          64
- #
- # ra31                                          next kernel address
- 
-@@ -102,6 +102,7 @@ mov ra20, 1
- mov ra21, 32
- mov ra22, 256
- mov ra23, 8
-+mov ra30, 64
- 
- mov rb20, 0xffffff00
- mov rb22, 255
-@@ -472,7 +473,7 @@ sub.setf -, r3, 8 ; mov r1, ra22
- # apply horizontal filter
- brr.anyn -, r:uvloop_b0
- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
--asr ra15, r0, 8         ; nop
-+asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
- nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
- 
- # apply vertical filter and write to VPM
-@@ -487,18 +488,18 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
- add r1, r1, r0          ; mul24 r0, ra15, rb15
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 14
--add r1, r1, ra21
--brr.anyn -, r:uvloop
--asr r1, r1, 6          # Delay 1
--min r1, r1, rb22       # Delay 2
--max vpm, r1, 0         # Delay 3
-+#asr r1, r1, 14
-+#add r1, r1, ra21
-+brr.anyn -, r:uvloop_b0
-+asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-+nop                    # Delay 2
-+nop                    # Delay 3
- 
- # DMA out for U
- 
- mov vw_setup, rb26 # VDW setup 0
- mov vw_setup, rb29 # Stride
--mov vw_addr, unif # start the VDW
-+mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
- 
- # DMA out for V
- # We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-@@ -639,12 +640,11 @@ mov ra12, ra13
- mov ra13, ra14
- 
- sub.setf -, r3, 8 ; mov r1, ra22
--
- # apply horizontal filter
- brr.anyn -, r:uvloop_b
- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
- asr ra15, r0, 8         ; nop
--nop                     ; nop
-+nop                     ; nop    # TODO improve use of delay slots
- 
- # apply vertical filter and write to VPM
- 
-@@ -658,15 +658,13 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
- add r1, r1, r0          ; mul24 r0, ra15, rb15
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 14
--add r1, r1, ra21
--asr r1, r1, 6
--min r1, r1, rb22
--add r0, vpm, 1          # Blend in previous VPM contents at this location
-+asr r1, r1, 14          # shift2=6
-+add r1, r1, vpm         # Blend in previous VPM contents at this location
-+add r1, r1, ra30
- brr.anyn -, r:uvloop_b
--max r1, r1, 0
--add r1, r1, r0
--shr vpm, r1, 1
-+asr r1, r1, 7           # Delay 1
-+min r1, r1, rb22        # Delay 2
-+max vpm, r1, 0          # Delay 3
- 
- 
- # DMA out for U
--- 
-2.7.4
-
-
-From ea60373134f98099c4ebaf0d23cca666008b4bba Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 10:55:07 +0100
-Subject: [PATCH 27/68] P prediction uses 4 tap filters
-
----
- libavcodec/hevc.c          |  50 ++--
- libavcodec/rpi_shader.c    | 631 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |  10 +-
- libavcodec/rpi_shader.qasm |  43 +--
- 4 files changed, 344 insertions(+), 390 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 28a6660..a47ebc5 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -65,15 +65,15 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
- // TODO Chroma only needs 4 taps
--static uint32_t rpi_filter_coefs[8][2] = {
--        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
--        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-+static uint32_t rpi_filter_coefs[8][1] = {
-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
-+        { ENCODE_COEFFS(  -2,  58,  10,  -2) },
-+        { ENCODE_COEFFS(  -4,  54,  16,  -2) },
-+        { ENCODE_COEFFS(  -6,  46,  28,  -4) },
-+        { ENCODE_COEFFS(  -4,  36,  36,  -4) },
-+        { ENCODE_COEFFS(  -4,  28,  46,  -6) },
-+        { ENCODE_COEFFS(  -2,  16,  54,  -4) },
-+        { ENCODE_COEFFS(  -2,  10,  58,  -2) }
- };
- 
- static uint32_t get_vc_address(AVBufferRef *bref) {
-@@ -2027,16 +2027,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      *u++ = rpi_filter_coefs[_mx][1];
-+                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      *u++ = rpi_filter_coefs[_my][1];
-+                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2084,16 +2084,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      *u++ = rpi_filter_coefs[_mx][1];
-+                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      *u++ = rpi_filter_coefs[_my][1];
-+                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2148,29 +2148,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      *u++ = rpi_filter_coefs[_mx][1];
-+                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      *u++ = rpi_filter_coefs[_my][1];
-+                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
- 
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx2][0];
--                      *u++ = rpi_filter_coefs[_mx2][1];
-+                      u++;
-                       *u++ = rpi_filter_coefs[_my2][0];
--                      *u++ = rpi_filter_coefs[_my2][1];
-+                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 77cca46..c8d0728 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -116,8 +116,8 @@ unsigned int rpi_shader[] = {
- /* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
- /* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
- /* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
- /* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-@@ -128,338 +128,315 @@ unsigned int rpi_shader[] = {
- /* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
- /* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
- /* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
--/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
--/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
--/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 6562fa9..1bf7a68 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 152)
--#define mc_filter_uv_b0 (rpi_shader + 370)
--#define mc_filter_uv_b (rpi_shader + 584)
--#define mc_exit (rpi_shader + 812)
--#define mc_interrupt_exit8 (rpi_shader + 830)
--#define mc_end (rpi_shader + 860)
-+#define mc_filter_uv_b0 (rpi_shader + 324)
-+#define mc_filter_uv_b (rpi_shader + 538)
-+#define mc_exit (rpi_shader + 766)
-+#define mc_interrupt_exit8 (rpi_shader + 784)
-+#define mc_end (rpi_shader + 814)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 562dc35..8e4f18f 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -16,8 +16,8 @@
- # ra19                                          next ra17
- #
- # rb16                                          pitch
--# rb17                                          height + 5
--# rb18                                          height + 7
-+# rb17                                          height + 1
-+# rb18                                          height + 3
- # rb19                                          next ra16
- #
- # ra20                                          1
-@@ -214,8 +214,8 @@ mov r0, unif
- shr r1, r0, r2 # Extract width
- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
- and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
-+add rb17, r0, 1
-+add rb18, r0, 3
- shl r0, r0, 7
- add r0, r0, r1 # Combine width and height of destination area
- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-@@ -230,18 +230,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
-+                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23;      mov r0, unif
--asr rb15, r0, rb23;     mul24 r0, r0, ra22
--asr rb14, r0, rb23;     mul24 r0, r0, ra22
--asr rb13, r0, rb23;     mul24 r0, r0, ra22
--asr rb12, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
-@@ -283,26 +276,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
- add r0, r2, r3
- 
- mov r3, rb31
- 
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
- mov ra12, ra13
- mov ra13, ra14
- 
--sub.setf -, r3, 8 ; mov r1, ra22
-+sub.setf -, r3, 4 ; mov r1, ra22
- 
- # apply horizontal filter
- brr.anyn -, r:uvloop
-@@ -312,14 +293,10 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r1, ra14, rb14
--nop                     ; mul24 r0, ra13, rb13
--add r1, r1, r0          ; mul24 r0, ra12, rb12
--add r1, r1, r0          ; mul24 r0, ra11, rb11
--add r1, r1, r0          ; mul24 r0, ra10, rb10
--add r1, r1, r0          ; mul24 r0, ra9, rb9
--add r1, r1, r0          ; mul24 r0, ra8, rb8
--add r1, r1, r0          ; mul24 r0, ra15, rb15
-+nop                     ; mul24 r1, ra14, rb10
-+nop                     ; mul24 r0, ra13, rb9
-+add r1, r1, r0          ; mul24 r0, ra12, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- asr r1, r1, 14
--- 
-2.7.4
-
-
-From e4bdd110d4640519b751ab428e7976a1e9a15802 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:03:51 +0100
-Subject: [PATCH 28/68] Optimised B0 pass
-
----
- libavcodec/rpi_shader.c    | 424 +++++++++++++++++++++------------------------
- libavcodec/rpi_shader.h    |   8 +-
- libavcodec/rpi_shader.qasm |  43 +----
- 3 files changed, 212 insertions(+), 263 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index c8d0728..1f63ee0 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -204,239 +204,215 @@ unsigned int rpi_shader[] = {
- /* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
- /* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
- /* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
- /* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
- /* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
- /* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
- /* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
--/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 1bf7a68..cb74887 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 152)
- #define mc_filter_uv_b0 (rpi_shader + 324)
--#define mc_filter_uv_b (rpi_shader + 538)
--#define mc_exit (rpi_shader + 766)
--#define mc_interrupt_exit8 (rpi_shader + 784)
--#define mc_end (rpi_shader + 814)
-+#define mc_filter_uv_b (rpi_shader + 490)
-+#define mc_exit (rpi_shader + 718)
-+#define mc_interrupt_exit8 (rpi_shader + 736)
-+#define mc_end (rpi_shader + 766)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 8e4f18f..faa5755 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -357,15 +357,13 @@ mov r0, unif
- shr r1, r0, r2 # Extract width
- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
- and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
-+add rb17, r0, 1
-+add rb18, r0, 3
- shl r0, r0, 7
- add r0, r0, r1 # Combine width and height of destination area
- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
- add rb26, r0, rb27
- 
--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
--
- # get filter coefficients
- 
- mov r0, unif
-@@ -373,18 +371,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
-+                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23;      mov r0, unif
--asr rb15, r0, rb23;     mul24 r0, r0, ra22
--asr rb14, r0, rb23;     mul24 r0, r0, ra22
--asr rb13, r0, rb23;     mul24 r0, r0, ra22
--asr rb12, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
-@@ -426,26 +417,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
- add r0, r2, r3
- 
- mov r3, rb31
- 
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
- mov ra12, ra13
- mov ra13, ra14
- 
--sub.setf -, r3, 8 ; mov r1, ra22
-+sub.setf -, r3, 4 ; mov r1, ra22
- 
- # apply horizontal filter
- brr.anyn -, r:uvloop_b0
-@@ -455,18 +434,12 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r1, ra14, rb14
--nop                     ; mul24 r0, ra13, rb13
--add r1, r1, r0          ; mul24 r0, ra12, rb12
--add r1, r1, r0          ; mul24 r0, ra11, rb11
--add r1, r1, r0          ; mul24 r0, ra10, rb10
--add r1, r1, r0          ; mul24 r0, ra9, rb9
--add r1, r1, r0          ; mul24 r0, ra8, rb8
--add r1, r1, r0          ; mul24 r0, ra15, rb15
-+nop                     ; mul24 r1, ra14, rb10
-+nop                     ; mul24 r0, ra13, rb9
-+add r1, r1, r0          ; mul24 r0, ra12, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--#asr r1, r1, 14
--#add r1, r1, ra21
- brr.anyn -, r:uvloop_b0
- asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
- nop                    # Delay 2
--- 
-2.7.4
-
-
-From 93805e78a13d36e28ed84a0e8456da2eac45be89 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:12:43 +0100
-Subject: [PATCH 29/68] Optimised B pass
-
----
- libavcodec/rpi_shader.c    | 202 ++++++++++++++++++++-------------------------
- libavcodec/rpi_shader.h    |   6 +-
- libavcodec/rpi_shader.qasm |  41 ++-------
- 3 files changed, 100 insertions(+), 149 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 1f63ee0..4e6c5ea 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -289,8 +289,8 @@ unsigned int rpi_shader[] = {
- /* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
- /* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
- /* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
- /* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
- /* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
- /* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-@@ -299,120 +299,96 @@ unsigned int rpi_shader[] = {
- /* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
- /* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
- /* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
--/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
--/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
--/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
--/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
--/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
--/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
--/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
--/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
--/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
--/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
--/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index cb74887..53da629 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -7,8 +7,8 @@ extern unsigned int rpi_shader[];
- #define mc_filter_uv (rpi_shader + 152)
- #define mc_filter_uv_b0 (rpi_shader + 324)
- #define mc_filter_uv_b (rpi_shader + 490)
--#define mc_exit (rpi_shader + 718)
--#define mc_interrupt_exit8 (rpi_shader + 736)
--#define mc_end (rpi_shader + 766)
-+#define mc_exit (rpi_shader + 670)
-+#define mc_interrupt_exit8 (rpi_shader + 688)
-+#define mc_end (rpi_shader + 718)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index faa5755..f38c926 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -491,8 +491,8 @@ mov r0, unif
- shr r1, r0, r2 # Extract width
- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
- and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
-+add rb17, r0, 1
-+add rb18, r0, 3
- shl r0, r0, 7
- 
- # r0 is currently height<<7
-@@ -508,8 +508,6 @@ add rb26, r0, rb27
- # In a B frame, so also set up VPM read (reading back 16bit precision)
- add vr_setup, r3, rb21
- 
--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
--
- # get filter coefficients
- 
- mov r0, unif
-@@ -517,18 +515,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
-+                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23;      mov r0, unif
--asr rb15, r0, rb23;     mul24 r0, r0, ra22
--asr rb14, r0, rb23;     mul24 r0, r0, ra22
--asr rb13, r0, rb23;     mul24 r0, r0, ra22
--asr rb12, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
-@@ -570,26 +561,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
- add r0, r2, r3
- 
- mov r3, rb31
- 
--mov ra8, ra9
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
- mov ra12, ra13
- mov ra13, ra14
- 
--sub.setf -, r3, 8 ; mov r1, ra22
-+sub.setf -, r3, 4 ; mov r1, ra22
- # apply horizontal filter
- brr.anyn -, r:uvloop_b
- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-@@ -598,14 +577,10 @@ nop                     ; nop    # TODO improve use of delay slots
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r1, ra14, rb14
--nop                     ; mul24 r0, ra13, rb13
--add r1, r1, r0          ; mul24 r0, ra12, rb12
--add r1, r1, r0          ; mul24 r0, ra11, rb11
--add r1, r1, r0          ; mul24 r0, ra10, rb10
--add r1, r1, r0          ; mul24 r0, ra9, rb9
--add r1, r1, r0          ; mul24 r0, ra8, rb8
--add r1, r1, r0          ; mul24 r0, ra15, rb15
-+nop                     ; mul24 r1, ra14, rb10
-+nop                     ; mul24 r0, ra13, rb9
-+add r1, r1, r0          ; mul24 r0, ra12, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- asr r1, r1, 14          # shift2=6
--- 
-2.7.4
-
-
-From e48df43c16de74dddbc7c702d64dd01eaf8e6b39 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:17:09 +0100
-Subject: [PATCH 30/68] Used P delay slots more efficiently
-
----
- libavcodec/rpi_shader.c    | 437 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |  10 +-
- libavcodec/rpi_shader.qasm |  19 +-
- 3 files changed, 228 insertions(+), 238 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 4e6c5ea..a1af4e3 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -156,239 +156,236 @@ unsigned int rpi_shader[] = {
- /* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- /* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- /* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
- /* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
--/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
--/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
--/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
--/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 53da629..1fb3e37 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 152)
--#define mc_filter_uv_b0 (rpi_shader + 324)
--#define mc_filter_uv_b (rpi_shader + 490)
--#define mc_exit (rpi_shader + 670)
--#define mc_interrupt_exit8 (rpi_shader + 688)
--#define mc_end (rpi_shader + 718)
-+#define mc_filter_uv_b0 (rpi_shader + 318)
-+#define mc_filter_uv_b (rpi_shader + 484)
-+#define mc_exit (rpi_shader + 664)
-+#define mc_interrupt_exit8 (rpi_shader + 682)
-+#define mc_end (rpi_shader + 712)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index f38c926..02e95dd 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -268,6 +268,7 @@ add t0s, ra_x2_base, r2
- 
- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
-+# apply horizontal filter
- nop                  ; mul24 r2, r0, ra0
- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
- nop                  ; mul24      r3, ra1 << 1, r0 << 1
-@@ -276,20 +277,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r0, r2, r3
--
--mov r3, rb31
--
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 4 ; mov r1, ra22
--
--# apply horizontal filter
-+add r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
- brr.anyn -, r:uvloop
--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
--asr ra15, r0, 8         ; nop
--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+mov ra13, ra14       # Delay slot 1
-+mov ra14, ra15       # Delay slot 2
-+mov ra15, r0         # Delay slot 3
- 
- # apply vertical filter and write to VPM
- 
--- 
-2.7.4
-
-
-From b33dfc243ff5509299685add3c532ab7f207fd73 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:22:25 +0100
-Subject: [PATCH 31/68] Improved use of delay slots
-
----
- libavcodec/rpi_shader.c    | 503 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |  10 +-
- libavcodec/rpi_shader.qasm |  41 ++--
- 3 files changed, 265 insertions(+), 289 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index a1af4e3..c498f28 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -122,270 +122,263 @@ unsigned int rpi_shader[] = {
- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
- /* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
--/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
--/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
--/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b
--/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
--/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
--/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
--/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
--/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
--/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
--/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_interrupt_exit8
-+/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--// ::mc_interrupt_exit8
--/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 1fb3e37..3fac45f 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 152)
--#define mc_filter_uv_b0 (rpi_shader + 318)
--#define mc_filter_uv_b (rpi_shader + 484)
--#define mc_exit (rpi_shader + 664)
--#define mc_interrupt_exit8 (rpi_shader + 682)
--#define mc_end (rpi_shader + 712)
-+#define mc_filter_uv_b0 (rpi_shader + 316)
-+#define mc_filter_uv_b (rpi_shader + 476)
-+#define mc_exit (rpi_shader + 650)
-+#define mc_interrupt_exit8 (rpi_shader + 668)
-+#define mc_end (rpi_shader + 698)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 02e95dd..10f5113 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -221,8 +221,6 @@ add r0, r0, r1 # Combine width and height of destination area
- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
- add rb26, r0, rb27
- 
--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
--
- # get filter coefficients
- 
- mov r0, unif
-@@ -410,20 +408,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r0, r2, r3
--
--mov r3, rb31
--
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 4 ; mov r1, ra22
--
--# apply horizontal filter
-+add r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
- brr.anyn -, r:uvloop_b0
--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
--asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+mov ra13, ra14       # Delay slot 1
-+mov ra14, ra15       # Delay slot 2
-+mov ra15, r0         # Delay slot 3
- 
- # apply vertical filter and write to VPM
- 
-@@ -432,9 +422,9 @@ nop                     ; mul24 r0, ra13, rb9
- add r1, r1, r0          ; mul24 r0, ra12, rb8
- add r1, r1, r0          ; mul24 r0, ra15, rb11
- add r1, r1, r0          ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+sub.setf -, r3, rb18
- brr.anyn -, r:uvloop_b0
--asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-+asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still in 16bit precision
- nop                    # Delay 2
- nop                    # Delay 3
- 
-@@ -554,19 +544,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r0, r2, r3
--
--mov r3, rb31
--
--mov ra12, ra13
--mov ra13, ra14
--
--sub.setf -, r3, 4 ; mov r1, ra22
--# apply horizontal filter
-+add r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 4    ; mov ra12, ra13
- brr.anyn -, r:uvloop_b
--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
--asr ra15, r0, 8         ; nop
--nop                     ; nop    # TODO improve use of delay slots
-+mov ra13, ra14       # Delay slot 1
-+mov ra14, ra15       # Delay slot 2
-+mov ra15, r0         # Delay slot 3
- 
- # apply vertical filter and write to VPM
- 
--- 
-2.7.4
-
-
-From af59f8e00eb977e97debc5e72ba47e0077db1787 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:31:23 +0100
-Subject: [PATCH 32/68] Avoid writeback of first B results
-
----
- libavcodec/rpi_shader.c    | 229 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |   8 +-
- libavcodec/rpi_shader.qasm |  18 +---
- 3 files changed, 121 insertions(+), 134 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index c498f28..ba453a2 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -255,130 +255,125 @@ unsigned int rpi_shader[] = {
- /* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
- /* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
- /* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 3fac45f..45dbe0e 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
- #define mc_setup_uv (rpi_shader + 0)
- #define mc_filter_uv (rpi_shader + 152)
- #define mc_filter_uv_b0 (rpi_shader + 316)
--#define mc_filter_uv_b (rpi_shader + 476)
--#define mc_exit (rpi_shader + 650)
--#define mc_interrupt_exit8 (rpi_shader + 668)
--#define mc_end (rpi_shader + 698)
-+#define mc_filter_uv_b (rpi_shader + 466)
-+#define mc_exit (rpi_shader + 640)
-+#define mc_interrupt_exit8 (rpi_shader + 658)
-+#define mc_end (rpi_shader + 688)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 10f5113..e138c95 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -428,22 +428,14 @@ asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still
- nop                    # Delay 2
- nop                    # Delay 3
- 
-+# in pass0 we don't really need to save any results, but need to discard the uniforms
- # DMA out for U
- 
--mov vw_setup, rb26 # VDW setup 0
--mov vw_setup, rb29 # Stride
--mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
--
--# DMA out for V
--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
--# Could potentially push this write into the start of the next pipeline stage.
--mov r0, 16
--mov -, vw_wait
--
- bra -, ra31
--add vw_setup, rb26, r0 # VDW setup 0
--mov vw_setup, rb29 # Stride
--mov vw_addr, unif # start the VDW
-+mov r0, unif           # Delay 1
-+mov r0, unif           # Delay 2
-+nop                    # Delay 3
-+
- 
- ################################################################################
- 
--- 
-2.7.4
-
-
-From 12e57278cb19a769d2e1488e8e94003027493d09 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 14 May 2015 11:36:24 +0100
-Subject: [PATCH 33/68] Cutdown size of chroma prediction commands
-
----
- libavcodec/hevc.c          |  17 +-
- libavcodec/rpi_shader.c    | 543 ++++++++++++++++++++++-----------------------
- libavcodec/rpi_shader.h    |  12 +-
- libavcodec/rpi_shader.qasm |  11 +-
- 4 files changed, 281 insertions(+), 302 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index a47ebc5..32b89d5 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -56,7 +56,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- 
- #ifdef RPI_INTER_QPU
- 
--#define RPI_CHROMA_COMMAND_WORDS 12
-+#define RPI_CHROMA_COMMAND_WORDS 10
- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
-@@ -2032,11 +2032,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2091,9 +2088,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2154,11 +2149,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
--                      u++;
-                       *u++ = rpi_filter_coefs[_my][0];
--                      u++;
--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                      u+=2; // Intermediate results are not written back in first pass of B filtering
- 
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-@@ -2166,11 +2158,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx2][0];
--                      u++;
-                       *u++ = rpi_filter_coefs[_my2][0];
--                      u++;
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2808,7 +2797,7 @@ static void rpi_inter_clear(HEVCContext *s)
-         *s->u_mvs[i]++ = pic_height;
-         *s->u_mvs[i]++ = s->frame->linesize[1];
-         *s->u_mvs[i]++ = s->frame->linesize[2];
--        s->u_mvs[i] += 3;  // Padding words
-+        s->u_mvs[i] += 1;  // Padding words
-     }
- }
- 
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index ba453a2..b0b93b5 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -89,291 +89,286 @@ unsigned int rpi_shader[] = {
- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv
--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
--/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
--/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 45dbe0e..99927c4 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,11 +4,11 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 152)
--#define mc_filter_uv_b0 (rpi_shader + 316)
--#define mc_filter_uv_b (rpi_shader + 466)
--#define mc_exit (rpi_shader + 640)
--#define mc_interrupt_exit8 (rpi_shader + 658)
--#define mc_end (rpi_shader + 688)
-+#define mc_filter_uv (rpi_shader + 148)
-+#define mc_filter_uv_b0 (rpi_shader + 310)
-+#define mc_filter_uv_b (rpi_shader + 458)
-+#define mc_exit (rpi_shader + 630)
-+#define mc_interrupt_exit8 (rpi_shader + 648)
-+#define mc_end (rpi_shader + 678)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index e138c95..d9ffcda 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -167,8 +167,6 @@ add t0s, r2, r1
- 
- # Dump padding words
- mov r0, unif
--mov r0, unif
--mov r0, unif
- 
- # submit texture requests for second line
- max r1, ra_y, 0
-@@ -228,11 +226,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
-+asr rb8, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
-@@ -362,11 +359,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
-+asr rb8, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
-@@ -490,11 +486,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
- asr ra2, r0, rb23;      mul24 r0, r0, ra22
- asr ra1, r0, rb23;      mul24 r0, r0, ra22
- asr ra0, r0, rb23;      mov r0, unif
--                        mov r0, unif
- asr rb11, r0, rb23;     mul24 r0, r0, ra22
- asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
-+asr rb8, r0, rb23
- 
- # r2 is elem_num
- # r3 is loop counter
--- 
-2.7.4
-
-
-From 3e8f02cf9d3e4bfcd07a5fcf321ace07c4f2e6f3 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 14 May 2015 15:21:49 +0100
-Subject: [PATCH 34/68] hevc: don't redirect when not rpi_enabled
-
----
- libavcodec/hevc.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 32b89d5..2459e34 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -1468,7 +1468,7 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-  */
- 
- #ifdef RPI_INTER
--#define RPI_REDIRECT(fn) rpi_ ## fn
-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
- static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                         int block_w, int block_h, int luma_weight, int luma_offset)
--- 
-2.7.4
-
-
-From 6da455b382b28c3c1f4e98c1703a695cdb946ad3 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 14 May 2015 15:22:02 +0100
-Subject: [PATCH 35/68] Use /dev/vcio for mailbox access
-
----
- libavcodec/rpi_mailbox.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-index 536896f..77a56dd 100644
---- a/libavcodec/rpi_mailbox.c
-+++ b/libavcodec/rpi_mailbox.c
-@@ -39,7 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- 
- #define MAJOR_NUM 100
- #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
--#define DEVICE_FILE_NAME "/dev/char_dev"
-+#define DEVICE_FILE_NAME "/dev/vcio"
- 
- #include "rpi_mailbox.h"
- 
--- 
-2.7.4
-
-
-From f96ef6131f16a4c03b8e2882bdf7319c3b646a6c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 14 May 2015 15:25:25 +0100
-Subject: [PATCH 36/68] Use vcsm for all memory allocations
-
----
- libavcodec/rpi_qpu.c | 174 +++++++++++++++++++--------------------------------
- 1 file changed, 64 insertions(+), 110 deletions(-)
-
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 60bf079..f62051f 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -1,7 +1,5 @@
- #ifdef RPI
--// define RPI_USE_VCSM to use the vcsm device for shared memory
- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
--#define RPI_USE_VCSM
- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
- #define RPI_TIME_TOTAL_QPU
- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-@@ -25,9 +23,7 @@
- #include "rpi_shader.h"
- #include "rpi_hevc_transform.h"
- 
--#ifdef RPI_USE_VCSM
- #include "rpi_user_vcsm.h"
--#endif
- 
- // On Pi2 there is no way to access the VPU L2 cache
- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-@@ -96,7 +92,6 @@ struct GPU
-   unsigned int vpu_code[VPU_CODE_SIZE];
-   short transMatrix2even[16*16*2];
-   int open_count; // Number of allocated video buffers
--  unsigned int vc_handle; // Handle of this memory
-   int      mb; // Mailbox handle
-   int      vc; // Address in GPU memory
-   int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-@@ -105,6 +100,7 @@ struct GPU
- // Stop more than one thread trying to allocate memory or use the processing resources at once
- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
- static volatile struct GPU* gpu = NULL;
-+static GPU_MEM_PTR_T gpu_mem_ptr;
- 
- #if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
- static unsigned int Microseconds(void) {
-@@ -132,39 +128,27 @@ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
- static volatile int vpu_async_head=0;
- #endif
- 
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
-+
- // Connect to QPU, returns 0 on success.
- static int gpu_init(volatile struct GPU **gpu) {
-   int mb = mbox_open();
-   int vc;
--  int handle;
-   volatile struct GPU* ptr;
- 	if (mb < 0)
- 		return -1;
- 
- 	if (qpu_enable(mb, 1)) return -2;
- 
--#ifdef RPI_USE_VCSM
-   vcsm_init();
--#endif
-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+  memset(ptr, 0, sizeof *ptr);
-+  vc = gpu_mem_ptr.vc;
- 
--  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
--  if (!handle)
--  {
--    qpu_enable(mb, 0);
--    return -3;
--  }
--	vc = mem_lock(mb, handle);
--	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
--	if (ptr == NULL)
--	{	mem_free(mb, handle);
--		mem_unlock(mb, handle);
--		qpu_enable(mb, 0);
--		return -4;
--	}
--
--	ptr->mb = mb;
--	ptr->vc_handle = handle;
--	ptr->vc = vc;
-+  ptr->mb = mb;
-+  ptr->vc = vc;
- 
-   printf("GPU allocated at 0x%x\n",vc);
- 
-@@ -226,94 +210,74 @@ static void gpu_unlock(void) {
-   pthread_mutex_unlock(&gpu_mutex);
- }
- 
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  assert(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  assert(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  assert(p->arm);
-+  p->vc = mem_lock(mb, p->vc_handle);
-+  assert(p->vc);
-+  return 0;
-+}
-+
- // Allocate memory on GPU
- // Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
- // Returns 0 on success.
- // This allocates memory that will not be cached in ARM's data cache.
- // Therefore safe to use without data cache flushing.
--int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+  int r;
-   gpu_lock();
--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
--  p->vcsm_handle = 0;
--  if (!p->vc_handle)
--  {
--    qpu_enable(gpu->mb, 0);
--    return -3;
--  }
--  p->vc = mem_lock(gpu->mb, p->vc_handle);
--  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
--  p->numbytes = numbytes;
--  if (p->arm == NULL)
--  {
--    mem_free(gpu->mb, p->vc_handle);
--    mem_unlock(gpu->mb, p->vc_handle);
--    gpu_unlock();
--    qpu_enable(gpu->mb, 0);
--    return -4;
--  }
-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-   gpu->open_count++;
-   gpu_unlock();
--  return 0;
-+  return r;
- }
- 
- void gpu_cache_flush(GPU_MEM_PTR_T *p)
- {
--  // This only works when using RPI_USE_VCSM
-   void *tmp = vcsm_lock(p->vcsm_handle);
-   vcsm_unlock_ptr(tmp);
- }
- 
-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+  assert(p->vcsm_handle);
-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+  assert(p->vc_handle);
-+  p->arm = vcsm_lock(p->vcsm_handle);
-+  assert(p->arm);
-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+  assert(p->vc);
-+  return 0;
-+}
-+
- // This allocates data that will be
- //    Cached in ARM L2
- //    Uncached in VPU L2
--int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+  int r;
-   gpu_lock();
--#ifdef RPI_USE_VCSM
--  {
--      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
--      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
--      p->arm = vcsm_lock(p->vcsm_handle);
--      p->vc = mem_lock(gpu->mb, p->vc_handle);
--  }
--#else
--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
--  p->vcsm_handle = 0;
--  if (!p->handle)
--  {
--    qpu_enable(gpu->mb, 0);
--    return -3;
--  }
--  p->vc = mem_lock(gpu->mb, p->vc_handle);
--  printf("This mapmem_private does not seem to work\n");
--  exit(-1);
--  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
--  p->numbytes = numbytes;
--  if (p->arm == NULL)
--  {
--    mem_free(gpu->mb, p->handle);
--    mem_unlock(gpu->mb, p->handle);
--    gpu_unlock();
--    qpu_enable(gpu->mb, 0);
--    return -4;
--  }
--#endif
-+  r = gpu_malloc_cached_internal(numbytes, p);
-   gpu->open_count++;
-   gpu_unlock();
--  return 0;
-+  return r;
- }
- 
- static void gpu_term(void)
- {
--	int mb;
--	unsigned handle;
-+  int mb;
- 
-   if (gpu==NULL)
-     return;
-   mb = gpu->mb;
--  handle = gpu->vc_handle;
- 
- #ifdef RPI_ASYNC
-   {
-@@ -323,37 +287,26 @@ static void gpu_term(void)
-   }
- #endif
- 
-+  qpu_enable(mb, 0);
-+  gpu_free_internal(&gpu_mem_ptr);
- 
--	unmapmem((void*)gpu, sizeof(struct GPU));
--	mem_unlock(mb, handle);
--	mem_free(mb, handle);
--	qpu_enable(mb, 0);
--#ifdef RPI_USE_VCSM
-   vcsm_exit();
--#endif
--	mbox_close(mb);
-+
-+  mbox_close(mb);
-   gpu = NULL;
- }
- 
--void gpu_free(GPU_MEM_PTR_T *p) {
-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
-   int mb = gpu->mb;
--	unsigned handle = p->vc_handle;
-+  mem_unlock(mb,p->vc_handle);
-+  vcsm_unlock_ptr(p->arm);
-+  vcsm_free(p->vcsm_handle);
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T *p) {
-   gpu_lock();
--#ifdef RPI_USE_VCSM
--  if (p->vcsm_handle) {
--      mem_unlock(mb,p->vc_handle);
--      vcsm_unlock_ptr(p->arm);
--      vcsm_free(p->vcsm_handle);
--  } else {
--	unmapmem((void*)p->arm, sizeof(struct GPU));
--      mem_unlock(mb, handle);
--      mem_free(mb, handle);
--  }
--#else
--	unmapmem((void*)p->arm, sizeof(struct GPU));
--	mem_unlock(mb, handle);
--	mem_free(mb, handle);
--#endif
-+
-+  gpu_free_internal(p);
- 
-   gpu->open_count--;
-   if (gpu->open_count==0) {
-@@ -386,20 +339,21 @@ unsigned int vpu_get_constants(void) {
- 
- static void *vpu_start(void *arg) {
-   while(1) {
-+    int *p;
-     pthread_mutex_lock(&post_mutex);
-     while( vpu_async_tail - vpu_async_head <= 0)
-     {
-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-       pthread_cond_wait(&post_cond_tail, &post_mutex);
-     }
--    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-+    p = vpu_cmds[vpu_async_head%MAXCMDS];
-     pthread_mutex_unlock(&post_mutex);
- 
-     if (p[6] == -1) {
-       break; // Last job
-     }
-     if (p[7]) {
--        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+        //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-         //gpu_cache_flush(buf);
-     }
-     vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
--- 
-2.7.4
-
-
-From 7c94b833b48a455d27d82eb2ca1b53a162705caf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 14 May 2015 15:43:17 +0100
-Subject: [PATCH 37/68] Enable EARLY_MALLOC and fix sps access bug
-
----
- libavcodec/hevc.c | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 2459e34..4e82a15 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -44,7 +44,7 @@
- #ifdef RPI
-   #include "rpi_qpu.h"
-   // For some unknown reason, the code seems to crash if I do a late malloc
--  #define EARLY_MALLOC
-+  //#define EARLY_MALLOC
-   // Move Inter prediction into separate pass
-   #define RPI_INTER
- #endif
-@@ -149,7 +149,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
- #ifdef RPI
- #ifdef EARLY_MALLOC
- #else
--    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-+    assert(sps);
-+    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-     printf("pic_arrays_init\n");
-     printf("Allocated %d\n",coefs_per_row);
--- 
-2.7.4
-
-
-From 0a0a92817a7959d213dca9c75a242b6ad88d6b80 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 14 May 2015 16:40:51 +0100
-Subject: [PATCH 38/68] Add copy of av_mod_uintp2 for use with stable ffmpeg
-
----
- libavcodec/hevc.c | 8 ++++++++
- 1 file changed, 8 insertions(+)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 4e82a15..80db603 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -51,6 +51,14 @@
- 
- // #define DISABLE_MC
- 
-+#ifndef av_mod_uintp2
-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
-+{
-+    return a & ((1 << p) - 1);
-+}
-+#   define av_mod_uintp2   av_mod_uintp2_c
-+#endif
-+
- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
- 
- 
--- 
-2.7.4
-
-
-From c48d08e968b24c2e260b0cc76c7901a1b4d75bbf Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 18 May 2015 11:11:02 +0100
-Subject: [PATCH 39/68] Added support for weighted prediction in P frames
-
----
- libavcodec/hevc.c          |  52 ++++-
- libavcodec/rpi_shader.c    | 566 +++++++++++++++++++++++----------------------
- libavcodec/rpi_shader.h    |  12 +-
- libavcodec/rpi_shader.qasm |  39 +++-
- 4 files changed, 384 insertions(+), 285 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 80db603..9668ef8 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -64,7 +64,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- 
- #ifdef RPI_INTER_QPU
- 
--#define RPI_CHROMA_COMMAND_WORDS 10
-+#define RPI_CHROMA_COMMAND_WORDS 12
- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
-@@ -2031,6 +2031,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-                 int chan = x0>>8;
-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-@@ -2043,6 +2045,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-+                      if (weight_flag) {
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0] & 0xffff);
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1] & 0xffff);
-+                      } else {
-+                          *u++ = 1; // Weight of 1 and offset of 0
-+                          *u++ = 1;
-+                      }
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2085,6 +2094,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-                 int chan = x0>>8;
-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-@@ -2098,6 +2109,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-+                      if (weight_flag) {
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-+                      } else {
-+                          *u++ = 1; // Weight of 1 and offset of 0
-+                          *u++ = 1;
-+                      }
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2159,6 +2177,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-+                      u+=2; // Weights not supported in B slices
-                       u+=2; // Intermediate results are not written back in first pass of B filtering
- 
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-@@ -2169,6 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-                       *u++ = rpi_filter_coefs[_mx2][0];
-                       *u++ = rpi_filter_coefs[_my2][0];
-+                      u+=2; // Weights not supported in B slices
-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-@@ -2795,6 +2815,9 @@ static void rpi_inter_clear(HEVCContext *s)
-     int i;
-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+
-     for(i=0;i<8;i++) {
-         s->u_mvs[i] = s->mvs_base[i];
-         *s->u_mvs[i]++ = 0;
-@@ -2806,6 +2829,13 @@ static void rpi_inter_clear(HEVCContext *s)
-         *s->u_mvs[i]++ = pic_height;
-         *s->u_mvs[i]++ = s->frame->linesize[1];
-         *s->u_mvs[i]++ = s->frame->linesize[2];
-+        if (weight_flag) {
-+            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+        } else {
-+            *s->u_mvs[i]++ = 1 << 5;
-+            *s->u_mvs[i]++ = 6;
-+        }
-         s->u_mvs[i] += 1;  // Padding words
-     }
- }
-@@ -2849,12 +2879,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
- 
- #ifdef RPI
-+#ifdef RPI_INTER_QPU
-     s->enable_rpi = s->ps.sps->bit_depth == 8
-                     && s->ps.sps->width <= RPI_MAX_WIDTH
-                     && !s->ps.pps->cross_component_prediction_enabled_flag
-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
--                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-+#else
-+    s->enable_rpi = s->ps.sps->bit_depth == 8
-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-+#endif
-+
-+    /*if (!s->enable_rpi) {
-+      if (s->ps.pps->cross_component_prediction_enabled_flag)
-+        printf("Cross component\n");
-+      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-+        printf("Tiles\n");
-+      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+        printf("Weighted P slice\n");
-+      if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+        printf("Weighted B slice\n");
-+    }*/
- 
- #endif
- 
-@@ -2987,6 +3034,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
- 
- #ifdef RPI
-     s->enable_rpi = 0;
-+    //printf("Wavefront\n");
- #endif
- 
-     if(ctb_row) {
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index b0b93b5..3f04d80 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -88,287 +88,307 @@ unsigned int rpi_shader[] = {
- /* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
- // ::mc_filter_uv
--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
--/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
--/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
-+/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
-+/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
--/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
--/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 99927c4..cec9901 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,11 +4,11 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 148)
--#define mc_filter_uv_b0 (rpi_shader + 310)
--#define mc_filter_uv_b (rpi_shader + 458)
--#define mc_exit (rpi_shader + 630)
--#define mc_interrupt_exit8 (rpi_shader + 648)
--#define mc_end (rpi_shader + 678)
-+#define mc_filter_uv (rpi_shader + 152)
-+#define mc_filter_uv_b0 (rpi_shader + 342)
-+#define mc_filter_uv_b (rpi_shader + 494)
-+#define mc_exit (rpi_shader + 670)
-+#define mc_interrupt_exit8 (rpi_shader + 688)
-+#define mc_end (rpi_shader + 718)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index d9ffcda..97c4c02 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -9,7 +9,12 @@
- #                                               (ra15 isn't clamped to zero - this happens during the
- #                                                copy to ra14, and during its use in the vertical filter)
- #
--# rb8...rb15                                    eight vertical filter coefficients
-+# rb8...rb11                                    eight vertical filter coefficients
-+
-+# rb12 offset to add before shift
-+# rb13 shift
-+# rb14 weight (U on left, V on right)
-+# rb15 offset (U on left, V on right)
- #
- # ra16                                          clipped(row start address+elem_num)&~3
- # ra17                                          per-channel shifts
-@@ -165,6 +170,9 @@ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
- add t0s, r0, r1 ; mov ra_x2_base, r2
- add t0s, r2, r1
- 
-+mov rb12,unif # offset before shift
-+mov rb13,unif # offset after shift
-+
- # Dump padding words
- mov r0, unif
- 
-@@ -231,11 +239,21 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23
- 
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+mov r0, unif # U offset/weight
-+asr rb15, r0, r2  # Compute offset from MSBs
-+shl r0, r0, r2
-+asr rb14, r0, r2  # Compute weight from LSBs
-+mov r0, unif # V offset/weight
-+asr.ifnz rb15, r0, r2
-+shl r0, r0, r2
-+asr.ifnz rb14, r0, r2
-+
- # r2 is elem_num
- # r3 is loop counter
- 
- mov r5rep, -8
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
- # retrieve texture results and pick out bytes
- # then submit two more texture requests
-@@ -279,6 +297,11 @@ mov ra13, ra14       # Delay slot 1
- mov ra14, ra15       # Delay slot 2
- mov ra15, r0         # Delay slot 3
- 
-+mov rb12,32
-+mov rb13,6
-+mov rb14,1
-+mov rb15,0
-+
- # apply vertical filter and write to VPM
- 
- nop                     ; mul24 r1, ra14, rb10
-@@ -288,9 +311,11 @@ add r1, r1, r0          ; mul24 r0, ra15, rb11
- add r1, r1, r0          ; mov -, vw_wait
- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
- asr r1, r1, 14
--add r1, r1, ra21
-+nop                     ; mul24 r1, r1, rb14
-+add r1, r1, rb12
-+asr r1, r1, rb13
- brr.anyn -, r:uvloop
--asr r1, r1, 6          # Delay 1
-+add r1, r1, rb15       # Delay 1
- min r1, r1, rb22       # Delay 2
- max vpm, r1, 0         # Delay 3
- 
-@@ -364,6 +389,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23
- 
-+mov r0, unif # U offset/weight
-+mov r0, unif # V offset/weight
-+
- # r2 is elem_num
- # r3 is loop counter
- 
-@@ -491,6 +519,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
- asr rb9, r0, rb23;      mul24 r0, r0, ra22
- asr rb8, r0, rb23
- 
-+mov r0, unif # U offset/weight
-+mov r0, unif # V offset/weight
-+
- # r2 is elem_num
- # r3 is loop counter
- 
--- 
-2.7.4
-
-
-From 310d994ea39e29b41a6a013abc4d94e6b90487b2 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 19 May 2015 08:43:30 +0100
-Subject: [PATCH 40/68] Improved ordering of tasks
-
----
- libavcodec/hevc.c | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 9668ef8..951e2d3 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2943,15 +2943,15 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
--#ifdef RPI_INTER_QPU
--            // Kick off inter prediction on QPUs
--            rpi_execute_inter_qpu(s);
--#endif
-             // Transform all blocks
-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-             rpi_execute_transform(s);
-             // Perform inter prediction
-             rpi_execute_inter_cmds(s);
-+#ifdef RPI_INTER_QPU
-+            // Kick off inter prediction on QPUs
-+            rpi_execute_inter_qpu(s);
-+#endif
-             // Wait for transform completion
-             vpu_wait(s->vpu_id);
- 
--- 
-2.7.4
-
-
-From d6e1ce7898196e49e52a6223c12979b3d0014588 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 20 May 2015 19:58:19 +0100
-Subject: [PATCH 41/68] Drafted Luma inter prediction
-
----
- libavcodec/rpi_shader.qasm | 594 ++++++++++++++++++++++++++++++++++++++++++---
- 1 file changed, 554 insertions(+), 40 deletions(-)
-
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 97c4c02..9cfc0d9 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -2,7 +2,10 @@
- #
- # ra0...ra7                                     eight horizontal filter coefficients
- #
--# rb1...rb7                                     seven shifted copies of the current unfiltered row
-+# rb0 rx_shift2
-+# rb1 ra_y2_next
-+#
-+# rb4...rb7
- #
- # ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
- #
-@@ -26,9 +29,9 @@
- # rb19                                          next ra16
- #
- # ra20                                          1
--# ra21                                          32
-+# ra21                                          ra_21
- # ra22                                          256
--# ra23                                          8
-+# ra23                                          rx_shift2_next
- #
- # rb20                                          0xffffff00
- # rb21                                          vpm_setup for reading/writing 16bit results into VPM
-@@ -57,16 +60,23 @@
- .set rb_frame_width_minus_1,       rb25
- .set rb_frame_height_minus_1,      rb30
- .set rb_pitch,                     rb16
--.set ra_x_base,                    ra16
--.set rb_x_base_next,               rb19
--.set ra_x2_base,                   ra24
--.set ra_x2_base_next,              ra26
-+.set ra_x,                         ra16
-+.set ra_y2,                        ra21
-+.set ra_y2_next,                   rb1
-+
-+.set rb_x_next,                    rb19
-+.set rx_frame_base2_next,          rb19
-+
-+.set ra_frame_base,                ra24
-+.set ra_frame_base_next,           ra26
- .set ra_xshift,                    ra17
- 
--.set ra_x2shift,                   ra25
- .set ra_u2v_ref_offset,            ra25
-+.set ra_frame_base2,               ra25
- 
- .set ra_xshift_next,               ra19
-+.set rx_xshift2,                   rb0
-+.set rx_xshift2_next,              ra23
- 
- .set ra_x2shift_next,              ra27
- .set ra_u2v_dst_offset,            ra27
-@@ -83,11 +93,11 @@
- mov ra31, unif
- 
- # Load first request location
--add ra_x_base, unif, elem_num # Store x
-+add ra_x, unif, elem_num # Store x
- mov ra_y, unif # Store y
--mov ra_x2_base, unif # Store frame u base
-+mov ra_frame_base, unif # Store frame u base
- nop
--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
- 
- # Read image dimensions
- sub rb25,unif,1
-@@ -104,9 +114,7 @@ add rb24, r1, r0
- # load constants
- 
- mov ra20, 1
--mov ra21, 32
- mov ra22, 256
--mov ra23, 8
- mov ra30, 64
- 
- mov rb20, 0xffffff00
-@@ -156,18 +164,18 @@ mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which i
- add rb21, r0, r1
- 
- # Compute base address for first and second access
--mov r0, ra_x_base           # Load x
-+mov r0, ra_x           # Load x
- max r0, r0, 0; mov r1, ra_y # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
- shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
- add ra_y, r1, 1
- add r0, r0, r3
- and r0, r0, ~3
--max r1, r1, 0 ; mov ra_x_base, r0 # y
-+max r1, r1, 0 ; mov ra_x, r0 # y
- min r1, r1, rb_frame_height_minus_1
- # submit texture requests for first line
- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--add t0s, r0, r1 ; mov ra_x2_base, r2
-+add t0s, r0, r1 ; mov ra_frame_base, r2
- add t0s, r2, r1
- 
- mov rb12,unif # offset before shift
-@@ -182,8 +190,8 @@ min r1, r1, rb_frame_height_minus_1
- add ra_y, ra_y, 1
- bra -, ra31
- nop ; mul24 r1, r1, rb_pitch
--add t0s, r1, ra_x_base
--add t0s, r1, ra_x2_base
-+add t0s, r1, ra_x
-+add t0s, r1, ra_frame_base
- 
- 
- 
-@@ -192,7 +200,7 @@ add t0s, r1, ra_x2_base
- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
- 
- # At this point we have already issued two pairs of texture requests for the current block
--# ra_x_base, ra_x16_base point to the current coordinates for this block
-+# ra_x, ra_x16_base point to the current coordinates for this block
- ::mc_filter_uv
- mov ra31, unif
- 
-@@ -207,9 +215,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
- shl ra_xshift_next, r0, 3
- sub r2, unif, r3 # compute offset from frame base u to frame base v
- add r0, r0, r3
--and rb_x_base_next, r0, ~3
-+and rb_x_next, r0, ~3
- mov ra_y_next, r1
--add ra_x2_base_next, rb_x_base_next, r2
-+add ra_frame_base_next, rb_x_next, r2
- 
- # set up VPM write
- mov vw_setup, rb28
-@@ -265,16 +273,16 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
- 
- max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -297,7 +305,7 @@ mov ra13, ra14       # Delay slot 1
- mov ra14, ra15       # Delay slot 2
- mov ra15, r0         # Delay slot 3
- 
--mov rb12,32
-+mov rb12,32 # TODO remove these to make P weighted prediction work properly
- mov rb13,6
- mov rb14,1
- mov rb15,0
-@@ -342,7 +350,7 @@ mov vw_addr, unif # start the VDW
- # mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
- 
- # At this point we have already issued two pairs of texture requests for the current block
--# ra_x_base, ra_x16_base point to the current coordinates for this block
-+# ra_x, ra_x16_base point to the current coordinates for this block
- ::mc_filter_uv_b0
- mov ra31, unif
- 
-@@ -357,9 +365,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
- shl ra_xshift_next, r0, 3
- sub r2, unif, r3 # compute offset from frame base u to frame base v
- add r0, r0, r3
--and rb_x_base_next, r0, ~3
-+and rb_x_next, r0, ~3
- mov ra_y_next, r1
--add ra_x2_base_next, rb_x_base_next, r2
-+add ra_frame_base_next, rb_x_next, r2
- 
- # set up VPM write, we need to save 16bit precision
- mov vw_setup, rb21
-@@ -408,16 +416,16 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
- 
- max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -477,9 +485,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
- shl ra_xshift_next, r0, 3
- sub r2, unif, r3 # compute offset from frame base u to frame base v
- add r0, r0, r3
--and rb_x_base_next, r0, ~3
-+and rb_x_next, r0, ~3
- mov ra_y_next, r1
--add ra_x2_base_next, rb_x_base_next, r2
-+add ra_frame_base_next, rb_x_next, r2
- 
- # set up VPM write
- mov vw_setup, rb28
-@@ -538,16 +546,16 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
- 
- max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--add t0s, ra_x2_base, r2
-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+add t0s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -642,5 +650,511 @@ nop        ; nop ; thrend
- mov interrupt, 1; nop # delay slot 1
- nop        ; nop # delay slot 2
- 
-+
-+
-+
-+
-+# LUMA CODE
-+
-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-+# For P frames we make the second x,y coordinates offset by +8
-+
-+################################################################################
-+# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-+::mc_setup
-+
-+# Read starting kernel
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+mov ra31, unif
-+
-+# Compute base address for first and second access
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl ra_xshift_next, r0, 3 # Compute shifts
-+add ra_y, r1, 1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-+max r1, r1, 0
-+min r1, r1, rb_frame_height_minus_1
-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+add t0s, r2, r1 ; mov ra_frame_base, r2
-+
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl rx_xshift2_next, r0, 3 # Compute shifts
-+add ra_y2, r1, 1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-+max r1, r1, 0
-+min r1, r1, rb_frame_height_minus_1
-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+add t0s, r2, r1 ; mov ra_frame_base2, r2
-+
-+
-+# Read image dimensions
-+sub rb25,unif,1
-+sub rb30,unif,1
-+
-+# get source pitch
-+mov rb16, unif
-+
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
-+
-+# load constants
-+
-+mov ra20, 1
-+mov ra22, 256
-+mov ra30, 64
-+
-+mov rb20, 0xffffff00
-+mov rb22, 255
-+mov rb23, 24
-+
-+# touch vertical context to keep simulator happy
-+
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
-+
-+# Compute part of VPM to use for DMA output
-+mov r2, qpu_num
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1
-+
-+# Compute part of VPM to save data into
-+mov r2, qpu_num   # qpu_num = abcd
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+add rb28, r0, r1
-+
-+mov rb12,unif # offset before shift
-+mov rb13,unif # shift
-+
-+# Dump padding words
-+mov r0, unif
-+
-+# submit texture requests for second line
-+max r1, ra_y, 0
-+min r1, r1, rb_frame_height_minus_1
-+add ra_y, ra_y, 1
-+nop ; mul24 r1, r1, rb_pitch
-+add t0s, r1, ra_frame_base
-+
-+max r1, ra_y2, 0
-+min r1, r1, rb_frame_height_minus_1
-+bra -, ra31
-+add ra_y2, ra_y2, 1           # Delay 1
-+nop ; mul24 r1, r1, rb_pitch  # Delay 2
-+add t0s, r1, ra_frame_base2   # Delay 3
-+
-+
-+################################################################################
-+
-+# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-+# In a P block, only the first half of coefficients contain used information.
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x, ra_x16_base point to the current coordinates for this block
-+::mc_filter
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+mov rx_xshift2, rx_xshift2_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl ra_xshift_next, r0, 3 # Compute shifts
-+mov ra_y_next, r1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0   ; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl rx_xshift2_next, r0, 3 # Compute shifts
-+add ra_y2_next, r1, 1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+# get filter coefficients and discard unused B frame values
-+mov r0, unif
-+mov.ifnz -, unif # Alternate coefficients are unused for P frames
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+mov.ifnz -, unif
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+asr ra4, r0, rb23;      mov r0, unif
-+mov.ifnz -, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+mov.ifnz -, unif
-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+asr rb4, r0, rb23
-+
-+mov r0, unif # Frame0 offset/weight
-+mov.ifnz -, unif # Frame1 offset/weight unused
-+asr rb15, r0, r2  # Compute offset from MSBs
-+shl r0, r0, r2
-+asr rb14, r0, r2  # Compute weight from LSBs
-+
-+# r3 is loop counter
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+mov r3, 0
-+
-+:yloop
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+# If we knew there was no clipping then this code would get simpler.
-+# Perhaps we could add on the pitch and clip using larger values?
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, rx_xshift2
-+mov.ifz ra_y2, ra_y2_next
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+
-+max r2, ra_y2, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# apply horizontal filter
-+nop                  ; mul24 r2, r0, ra0
-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+add r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 8    ; mov ra12, ra13
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+brr.anyn -, r:yloop
-+mov ra13, ra14       # Delay slot 1
-+mov ra14, ra15       # Delay slot 2
-+mov ra15, r0         # Delay slot 3
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r1, ra14, rb10
-+nop                     ; mul24 r0, ra13, rb9
-+add r1, r1, r0          ; mul24 r0, ra12, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+add r1, r1, r0          ; mul24 r0, ra8, rb4
-+add r1, r1, r0          ; mul24 r0, ra9, rb5
-+add r1, r1, r0          ; mul24 r0, ra10, rb6
-+add r1, r1, r0          ; mul24 r0, ra11, rb7
-+
-+add r1, r1, r0          ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+asr r1, r1, 14
-+nop                     ; mul24 r1, r1, rb14
-+add r1, r1, rb12
-+asr r1, r1, rb13
-+brr.anyn -, r:yloop
-+add r1, r1, rb15       # Delay 1
-+min r1, r1, rb22       # Delay 2
-+max vpm, r1, 0         # Delay 3
-+
-+# DMA out
-+
-+bra -, ra31
-+mov vw_setup, rb26 # VDW setup 0    Delay 1
-+mov vw_setup, rb29 # Stride         Delay 2
-+mov vw_addr, unif # start the VDW   Delay 3
-+
-+
-+
-+################################################################################
-+
-+# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-+# In a P block, only the first half of coefficients contain used information.
-+# At this point we have already issued two pairs of texture requests for the current block
-+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-+# Can fill in the coefficients so only
-+# Can also assume default weighted prediction for B frames.
-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
-+# Or possibly by taking advantage of symmetry?
-+# From 19->7 32bits per command.
-+::mc_filter_b
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+mov ra31, unif
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+mov ra_xshift, ra_xshift_next
-+mov rx_xshift2, rx_xshift2_next
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl ra_xshift_next, r0, 3 # Compute shifts
-+mov ra_y_next, r1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+
-+add r0, unif, elem_num # Load x
-+max r0, r0, 0   ; mov r1, unif # Load y
-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+shl rx_xshift2_next, r0, 3 # Compute shifts
-+add ra_y2_next, r1, 1
-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+
-+
-+# set up VPM write
-+mov vw_setup, rb28
-+
-+# get width,height of block
-+mov r2, 16
-+mov r0, unif
-+shr r1, r0, r2 # Extract width
-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+and r0, r0, rb22 # Extract height
-+add rb17, r0, 5
-+add rb18, r0, 7
-+shl r0, r0, 7
-+add r0, r0, r1 # Combine width and height of destination area
-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
-+
-+# get filter coefficients and discard unused B frame values
-+mov r0, unif
-+mov r1, 1
-+mov.ifnz r0, unif # Alternate coefficients are unused for P frames
-+nop              ;      mul24 r0, r0 << 13, r1 << 13
-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+nop              ;      mul24 r0, r0 << 14, r1 << 14
-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+asr ra0, r0, rb23;      mov r0, unif
-+mov.ifnz r0, unif
-+nop              ;      mul24 r0, r0 << 9, r1 << 9
-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+nop              ;      mul24 r0, r0 << 10, r1 << 10
-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+nop              ;      mul24 r0, r0 << 11, r1 << 11
-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+nop              ;      mul24 r0, r0 << 12, r1 << 12
-+asr ra4, r0, rb23;      mov r0, unif
-+mov.ifnz r0, unif
-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+asr rb8, r0, rb23;      mov r0, unif
-+mov.ifnz r0, unif
-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+asr rb4, r0, rb23
-+
-+mov r0, unif # Frame0 offset/weight
-+mov.ifnz r0, unif # Frame1 offset/weight unused
-+asr rb15, r0, r2  # Compute offset from MSBs
-+shl r0, r0, r2
-+asr rb14, r0, r2  # Compute weight from LSBs
-+
-+# r3 is loop counter
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+mov r3, 0
-+
-+:yloopb
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+# If we knew there was no clipping then this code would get simpler.
-+# Perhaps we could add on the pitch and clip using larger values?
-+
-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+shr r1, r4, rx_xshift2
-+mov.ifz ra_y2, ra_y2_next
-+
-+max r2, ra_y, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+
-+max r2, ra_y2, 0  # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+
-+
-+# generate seven shifted versions
-+# interleave with scroll of vertical context
-+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+
-+# apply horizontal filter
-+nop                  ; mul24 r2, r0, ra0
-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+add r0, r2, r3       ; mov r3, rb31
-+sub.setf -, r3, 8    ; mov ra12, ra13
-+mov ra9, ra10
-+mov ra10, ra11
-+mov ra11, ra12
-+mov ra12, ra13
-+brr.anyn -, r:yloopb
-+mov ra13, ra14       # Delay slot 1
-+mov ra14, ra15       # Delay slot 2
-+mov ra15, r0         # Delay slot 3
-+
-+# apply vertical filter and write to VPM
-+
-+nop                     ; mul24 r1, ra14, rb10
-+nop                     ; mul24 r0, ra13, rb9
-+add r1, r1, r0          ; mul24 r0, ra12, rb8
-+add r1, r1, r0          ; mul24 r0, ra15, rb11
-+add r1, r1, r0          ; mul24 r0, ra8, rb4
-+add r1, r1, r0          ; mul24 r0, ra9, rb5
-+add r1, r1, r0          ; mul24 r0, ra10, rb6
-+add r1, r1, r0          ; mul24 r0, ra11, rb7
-+
-+add r1, r1, r0          ; mov -, vw_wait
-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+asr r1, r1, 14
-+nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
-+add r1, r1, ra30        ; mul24 r0, r1, rb14
-+add r1, r1, r0
-+brr.anyn -, r:yloopb
-+asr r1, r1, 7          # Delay 1
-+min r1, r1, rb22       # Delay 2
-+max vpm, r1, 0         # Delay 3
-+
-+# DMA out
-+bra -, ra31
-+mov vw_setup, rb26 # VDW setup 0    Delay 1
-+mov vw_setup, rb29 # Stride         Delay 2
-+mov vw_addr, unif # start the VDW   Delay 3
-+
-+################################################################################
-+
-+# mc_interrupt_exit12()
-+::mc_interrupt_exit12
-+mov  -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+ldtmu0
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
-+mov -,sacq(0) # 8
-+mov -,sacq(0) # 9
-+mov -,sacq(0) # 10
-+mov -,sacq(0) # 11
-+
-+nop        ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop        ; nop # delay slot 2
-+
-+
- ::mc_end
- # Do not add code here because mc_end must appear after all other code.
--- 
-2.7.4
-
-
-From f2ffe4186fa49cb27579953c276b51728a08a8b5 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 20 May 2015 19:58:30 +0100
-Subject: [PATCH 42/68] Added support for fast cache flush in deblocker
-
----
- libavcodec/hevc_filter.c   |   44 +-
- libavcodec/rpi_qpu.c       |    6 +
- libavcodec/rpi_qpu.h       |    2 +
- libavcodec/rpi_shader.c    | 1028 +++++++++++++++++++++++++++++---------------
- libavcodec/rpi_shader.h    |   16 +-
- libavcodec/rpi_user_vcsm.h |   22 +
- 6 files changed, 768 insertions(+), 350 deletions(-)
-
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 92a8271..186317a 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -37,6 +37,11 @@
- 
- #include "bit_depth_template.c"
- 
-+#ifdef RPI
-+#include "rpi_user_vcsm.h"
-+#include "rpi_qpu.h"
-+#endif
-+
- #define LUMA 0
- #define CB 1
- #define CR 2
-@@ -872,15 +877,46 @@ static void flush_buffer(AVBufferRef *bref) {
-     gpu_cache_flush(p);
- }
- 
--static void ff_hevc_flush_chroma(HEVCContext *s)
-+// Return Physical address for this image
-+static int ff_hevc_buf_base(AVBufferRef *bref) {
-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+  return p->vc & 0x3fffffff;
-+}
-+
-+static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
- {
-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-             s->nal_unit_type == NAL_TSA_N   ||
-             s->nal_unit_type == NAL_STSA_N  ||
-             s->nal_unit_type == NAL_RADL_N  ||
-             s->nal_unit_type == NAL_RASL_N )) {
-+#define RPI_FAST_CACHEFLUSH
-+#ifdef RPI_FAST_CACHEFLUSH
-+        struct vcsm_user_clean_invalid_s iocache = {};
-+        int curr_y = f->progress->data[0];
-+        int sz,base;
-+        if (curr_y < 0) curr_y = 0;
-+        if (n<=curr_y) return; // Should not happen
-+        sz = s->frame->linesize[1] * (n-curr_y);
-+        base = s->frame->linesize[1] * curr_y;
-+        iocache.s[0].cmd = 3; // Flush L1 cache
-+        iocache.s[0].addr = 0;
-+        iocache.s[0].size  = 0;
-+
-+        iocache.s[1].cmd = 2;
-+        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-+        iocache.s[1].size  = sz;
-+
-+        iocache.s[2].cmd = 2;
-+        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
-+        iocache.s[2].size  = sz;
-+
-+        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
-+
-+#else
-         flush_buffer(s->frame->buf[1]);
-         flush_buffer(s->frame->buf[2]);
-+#endif
-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-         //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-@@ -903,7 +939,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             sao_filter_CTB(s, x, y - ctb_size);
-             if (s->threads_type & FF_THREAD_FRAME ) {
- #ifdef RPI_INTER_QPU
--                ff_hevc_flush_chroma(s);
-+                ff_hevc_flush_chroma(s,&s->ref->tf, y);
- #endif
-                 ff_thread_report_progress(&s->ref->tf, y, 0);
-             }
-@@ -912,7 +948,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             sao_filter_CTB(s, x , y);
-             if (s->threads_type & FF_THREAD_FRAME ) {
- #ifdef RPI_INTER_QPU
--                ff_hevc_flush_chroma(s);
-+                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
- #endif
-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-             }
-@@ -922,7 +958,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-         //int currh = s->ref->tf.progress->data[0];
-         //if (((y + ctb_size)&63)==0)
- #ifdef RPI_INTER_QPU
--        ff_hevc_flush_chroma(s);
-+        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
- #endif
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-     }
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index f62051f..fd8a276 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -237,6 +237,12 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-   return r;
- }
- 
-+int gpu_get_mailbox(void)
-+{
-+  assert(gpu);
-+  return gpu->mb;
-+}
-+
- void gpu_cache_flush(GPU_MEM_PTR_T *p)
- {
-   void *tmp = vcsm_lock(p->vcsm_handle);
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 543c84b..88965e5 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -49,4 +49,6 @@ extern int rpi_test_shader(void);
- extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
- extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
- 
-+extern int gpu_get_mailbox(void);
-+
- #endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 3f04d80..9c30e32 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -23,11 +23,11 @@ __attribute__((aligned(8)))
- unsigned int rpi_shader[] = {
- // ::mc_setup_uv
- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
--/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
- /* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
- /* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
- /* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
- /* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-@@ -35,360 +35,708 @@ unsigned int rpi_shader[] = {
- /* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
- /* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
- /* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
--/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
--/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
--/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
--/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
--/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
--/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
--/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
--/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
--/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
--/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
--/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000070] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000078] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
- // ::mc_filter_uv
--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
--/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
--/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
--/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
--/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
--/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
--/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
--/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
--/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
--/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
-+/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
-+/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
--/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
--/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
--/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
--/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
--/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
--/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
--/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
--/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_setup
-+/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+// ::mc_filter
-+/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :yloop
-+/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_filter_b
-+/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
-+/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+// :yloopb
-+/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+// ::mc_interrupt_exit12
-+/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index cec9901..3fa8531 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,11 +4,15 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 152)
--#define mc_filter_uv_b0 (rpi_shader + 342)
--#define mc_filter_uv_b (rpi_shader + 494)
--#define mc_exit (rpi_shader + 670)
--#define mc_interrupt_exit8 (rpi_shader + 688)
--#define mc_end (rpi_shader + 718)
-+#define mc_filter_uv (rpi_shader + 148)
-+#define mc_filter_uv_b0 (rpi_shader + 338)
-+#define mc_filter_uv_b (rpi_shader + 490)
-+#define mc_exit (rpi_shader + 666)
-+#define mc_interrupt_exit8 (rpi_shader + 684)
-+#define mc_setup (rpi_shader + 714)
-+#define mc_filter (rpi_shader + 868)
-+#define mc_filter_b (rpi_shader + 1108)
-+#define mc_interrupt_exit12 (rpi_shader + 1364)
-+#define mc_end (rpi_shader + 1402)
- 
- #endif
-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-index fbebbbe..95e6de1 100644
---- a/libavcodec/rpi_user_vcsm.h
-+++ b/libavcodec/rpi_user_vcsm.h
-@@ -418,6 +418,28 @@ int vcsm_unlock_hdl( unsigned int handle );
- */
- int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
- 
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns:        non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate given physical range in L2
-+** 2: clean      given physical range in L2
-+** 3: clean+invalidate all of L1
-+** 4: flush      all of L2 and all of L1
-+*/
-+struct vcsm_user_clean_invalid_s {
-+    struct {
-+       unsigned int cmd;
-+       unsigned int addr;
-+       unsigned int size;
-+    } s[8];
-+};
-+
-+int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
-+
-+
- #ifdef __cplusplus
- }
- #endif
--- 
-2.7.4
-
-
-From 09685ab55aecb9400e354522894e0fbbb6381ca9 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 20 May 2015 21:12:55 +0100
-Subject: [PATCH 43/68] Added multi mailbox - not working
-
----
- libavcodec/hevc.c        | 40 ++++++++++++++++++++++++++++---
- libavcodec/rpi_mailbox.c | 47 +++++++++++++++++++++++++++++++++++++
- libavcodec/rpi_mailbox.h |  5 ++++
- libavcodec/rpi_qpu.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++----
- libavcodec/rpi_qpu.h     |  2 ++
- 5 files changed, 147 insertions(+), 8 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 951e2d3..ab63efd 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -47,6 +47,11 @@
-   //#define EARLY_MALLOC
-   // Move Inter prediction into separate pass
-   #define RPI_INTER
-+
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-+    #define RPI_MULTI_MAILBOX
-+  #endif
- #endif
- 
- // #define DISABLE_MC
-@@ -2843,10 +2848,14 @@ static void rpi_inter_clear(HEVCContext *s)
- static void rpi_execute_inter_qpu(HEVCContext *s)
- {
-     int k;
-+    int i;
-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
--
--    if (s->sh.slice_type == I_SLICE)
--        return;
-+    if (s->sh.slice_type == I_SLICE) {
-+#ifdef RPI_MULTI_MAILBOX
-+      rpi_execute_transform(s);
-+      return;
-+#endif
-+    }
-     for(k=0;k<8;k++) {
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-@@ -2856,6 +2865,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- 
-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
- 
-+#ifdef RPI_MULTI_MAILBOX
-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+                                 );
-+    for(i=0;i<4;i++)
-+        s->num_coeffs[i] = 0;
-+#else
-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-       (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-       (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-@@ -2866,6 +2891,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-       (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-       );
-+#endif
- }
- #endif
- 
-@@ -2945,6 +2971,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-             // Transform all blocks
-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+#ifdef RPI_MULTI_MAILBOX
-+            // Kick off inter prediction on QPUs
-+            rpi_execute_inter_qpu(s);
-+            // Perform luma inter prediction
-+            rpi_execute_inter_cmds(s);
-+#else
-             rpi_execute_transform(s);
-             // Perform inter prediction
-             rpi_execute_inter_cmds(s);
-@@ -2952,6 +2984,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-             // Kick off inter prediction on QPUs
-             rpi_execute_inter_qpu(s);
- #endif
-+#endif
-+
-             // Wait for transform completion
-             vpu_wait(s->vpu_id);
- 
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-index 77a56dd..3904efc 100644
---- a/libavcodec/rpi_mailbox.c
-+++ b/libavcodec/rpi_mailbox.c
-@@ -276,6 +276,53 @@ unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigne
-    return p[5];
- }
- 
-+void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
-+   int i=0;
-+   unsigned p[32];
-+
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x30018; // (the tag id)
-+   p[i++] = 88; // (size of the buffer)
-+   p[i++] = 88; // (size of the data)
-+
-+   p[i++] = num_qpus;
-+   p[i++] = control;
-+   p[i++] = noflush;
-+   p[i++] = timeout; // ms
-+
-+   p[i++] = num_qpus_2;
-+   p[i++] = control_2;
-+   p[i++] = noflush_2;
-+   p[i++] = timeout_2; // ms
-+
-+   p[i++] = code;
-+   p[i++] = r0;
-+   p[i++] = r1;
-+   p[i++] = r2;
-+   p[i++] = r3;
-+   p[i++] = r4;
-+   p[i++] = r5;
-+
-+   p[i++] = code_2;
-+   p[i++] = r0_2;
-+   p[i++] = r1_2;
-+   p[i++] = r2_2;
-+   p[i++] = r3_2;
-+   p[i++] = r4_2;
-+   p[i++] = r5_2;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return;
-+}
-+
- int mbox_open() {
-    int file_desc;
- 
-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-index c264d2e..5898102 100644
---- a/libavcodec/rpi_mailbox.h
-+++ b/libavcodec/rpi_mailbox.h
-@@ -15,6 +15,11 @@ extern void unmapmem(void *addr, unsigned size);
- 
- extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
- extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-+extern void execute_multi(int file_desc,
-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
- extern unsigned qpu_enable(int file_desc, unsigned enable);
- 
- #endif
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index fd8a276..feb3284 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -123,7 +123,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
- 
--static int vpu_cmds[MAXCMDS][8];
-+static int vpu_cmds[MAXCMDS][16];
- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
- static volatile int vpu_async_head=0;
- #endif
-@@ -346,6 +346,7 @@ unsigned int vpu_get_constants(void) {
- static void *vpu_start(void *arg) {
-   while(1) {
-     int *p;
-+    int qpu_code;
-     pthread_mutex_lock(&post_mutex);
-     while( vpu_async_tail - vpu_async_head <= 0)
-     {
-@@ -358,12 +359,25 @@ static void *vpu_start(void *arg) {
-     if (p[6] == -1) {
-       break; // Last job
-     }
--    if (p[7]) {
-+    qpu_code = p[7];
-+    //if (p[7]) {
-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-         //gpu_cache_flush(buf);
--    }
--    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+    //}
-+    if (!qpu_code) {
-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+    } else {
-+      int i;
-+      for(i=0;i<8;i++) {
-+        gpu->mail[i*2] = p[8+i];
-+        gpu->mail[i*2 + 1] = qpu_code;
-+      }
- 
-+      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                              0, 0, 0, 0,
-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+    }
-     pthread_mutex_lock(&post_mutex);
-     vpu_async_head++;
-     pthread_cond_broadcast(&post_cond_head);
-@@ -400,7 +414,43 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-     p[4] = r3;
-     p[5] = r4;
-     p[6] = r5;
--    p[7] = (int) buf;
-+    p[7] = 0;
-+    if (num<=1)
-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+    pthread_mutex_unlock(&post_mutex);
-+    return id;
-+  }
-+}
-+
-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-+{
-+
-+  pthread_mutex_lock(&post_mutex);
-+  {
-+    int id = vpu_async_tail++;
-+    int *p = vpu_cmds[id%MAXCMDS];
-+    int num = vpu_async_tail - vpu_async_head;
-+    if (num>MAXCMDS) {
-+      printf("Too many commands submitted\n");
-+      exit(-1);
-+    }
-+    p[0] = vpu_code;
-+    p[1] = r0;
-+    p[2] = r1;
-+    p[3] = r2;
-+    p[4] = r3;
-+    p[5] = r4;
-+    p[6] = r5;
-+    p[7] = qpu_code;
-+    p[8 ] = unifs1;
-+    p[9 ] = unifs2;
-+    p[10] = unifs3;
-+    p[11] = unifs4;
-+    p[12] = unifs5;
-+    p[13] = unifs6;
-+    p[14] = unifs7;
-+    p[15] = unifs8;
-     if (num<=1)
-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-     pthread_mutex_unlock(&post_mutex);
-@@ -966,6 +1016,7 @@ void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, i
- }
- 
- 
-+
- #endif
- 
- #endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 88965e5..2f08f03 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -41,6 +41,8 @@ extern unsigned int vpu_get_fn(void);
- extern unsigned int vpu_get_constants(void);
- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
- extern void vpu_wait( int id);
- 
- // Simple test of shader code
--- 
-2.7.4
-
-
-From 311f2da06d13a98d9bdda2df8684d7cf55b9a08e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 21 May 2015 16:50:02 +0100
-Subject: [PATCH 44/68] Pass qpu number in as uniform
-
----
- libavcodec/hevc.c          |    2 +-
- libavcodec/rpi_shader.c    | 1288 ++++++++++++++++++++++----------------------
- libavcodec/rpi_shader.h    |   20 +-
- libavcodec/rpi_shader.qasm |   10 +-
- 4 files changed, 657 insertions(+), 663 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index ab63efd..caadfaa 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -2834,6 +2834,7 @@ static void rpi_inter_clear(HEVCContext *s)
-         *s->u_mvs[i]++ = pic_height;
-         *s->u_mvs[i]++ = s->frame->linesize[1];
-         *s->u_mvs[i]++ = s->frame->linesize[2];
-+        *s->u_mvs[i]++ = i;
-         if (weight_flag) {
-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-@@ -2841,7 +2842,6 @@ static void rpi_inter_clear(HEVCContext *s)
-             *s->u_mvs[i]++ = 1 << 5;
-             *s->u_mvs[i]++ = 6;
-         }
--        s->u_mvs[i] += 1;  // Padding words
-     }
- }
- 
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index 9c30e32..a0f0282 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -48,8 +48,8 @@ unsigned int rpi_shader[] = {
- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
-+/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
- /* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
- /* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
- /* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-@@ -60,669 +60,669 @@ unsigned int rpi_shader[] = {
- /* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
- /* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
- /* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
--/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
--/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
--/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
--/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
--/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
--/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
--/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
- // ::mc_filter_uv
--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
--/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
--/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
--/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
--/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
--/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
--/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
--/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
--/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
--/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
--/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
-+/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
-+/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
--/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
--/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_setup
--/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
--/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
--/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
--/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
--/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
--/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
--/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
--/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
--/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
--/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
--/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
--/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
--/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
--/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
--/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
--/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
--/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
--/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
--/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+/* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
- // ::mc_filter
--/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
--/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
--/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
--/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
--/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
--/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :yloop
--/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
--/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
--/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
--/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
--/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
--/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
--/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
--/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
--/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
--/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
--/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
--/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
--/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
--/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
--/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
--/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
--/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_b
--/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
--/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
--/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
--/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
--/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
--/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
--/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
--/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
--/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
--/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
--/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
--/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
--/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
--/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
--/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
-+/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :yloopb
--/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
--/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
--/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
--/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
--/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
--/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
--/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
--/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
--/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
--/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
--/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
--/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
--/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
--/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
--/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_interrupt_exit12
--/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
- /* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-@@ -732,11 +732,9 @@ unsigned int rpi_shader[] = {
- /* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
- /* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 3fa8531..6e552d9 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,15 +4,15 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 148)
--#define mc_filter_uv_b0 (rpi_shader + 338)
--#define mc_filter_uv_b (rpi_shader + 490)
--#define mc_exit (rpi_shader + 666)
--#define mc_interrupt_exit8 (rpi_shader + 684)
--#define mc_setup (rpi_shader + 714)
--#define mc_filter (rpi_shader + 868)
--#define mc_filter_b (rpi_shader + 1108)
--#define mc_interrupt_exit12 (rpi_shader + 1364)
--#define mc_end (rpi_shader + 1402)
-+#define mc_filter_uv (rpi_shader + 144)
-+#define mc_filter_uv_b0 (rpi_shader + 334)
-+#define mc_filter_uv_b (rpi_shader + 486)
-+#define mc_exit (rpi_shader + 662)
-+#define mc_interrupt_exit8 (rpi_shader + 680)
-+#define mc_setup (rpi_shader + 710)
-+#define mc_filter (rpi_shader + 864)
-+#define mc_filter_b (rpi_shader + 1104)
-+#define mc_interrupt_exit12 (rpi_shader + 1360)
-+#define mc_end (rpi_shader + 1398)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 9cfc0d9..a0b8e5a 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -133,8 +133,8 @@ mov ra14, 0
- mov ra15, 0
- 
- # Compute part of VPM to use for DMA output
--mov r2, qpu_num
--shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+mov r3, unif
-+shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
- and r2, r2, 15
- mov r1, r2
- asr r1, r1, 2
-@@ -147,8 +147,7 @@ shl r0, r0, 5
- add rb27, r0, r1
- 
- # Compute part of VPM to save data into
--mov r2, qpu_num   # qpu_num = abcd
--shl r2, r2, 1
-+shl r2, r3, 1
- and r2, r2, 15    # r2 = bcd0
- mov r1, r2        # r1 = bcd0
- asr r1, r1, 2     # r1 = bc
-@@ -181,9 +180,6 @@ add t0s, r2, r1
- mov rb12,unif # offset before shift
- mov rb13,unif # offset after shift
- 
--# Dump padding words
--mov r0, unif
--
- # submit texture requests for second line
- max r1, ra_y, 0
- min r1, r1, rb_frame_height_minus_1
--- 
-2.7.4
-
-
-From db6fe49d50e42c444b5833acc6206c0bbfaacef4 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 23 May 2015 13:20:21 +0100
-Subject: [PATCH 45/68] Add new cache flushing routine
-
----
- libavcodec/hevc.c          |  8 +++--
- libavcodec/hevc_filter.c   | 39 ++++++++++-----------
- libavcodec/rpi_qpu.c       | 17 +++++++--
- libavcodec/rpi_qpu.h       |  2 ++
- libavcodec/rpi_user_vcsm.h | 86 ++++++++++++++++++++++++++--------------------
- 5 files changed, 91 insertions(+), 61 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index caadfaa..9d12583 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3575,9 +3575,13 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-     }
- 
- fail:
--    if (s->ref && s->threads_type == FF_THREAD_FRAME)
-+    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-+#ifdef RPI_INTER_QPU
-+        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-+#endif
-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
--
-+    }
-     return ret;
- }
- 
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 186317a..ec84e8a 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -883,36 +883,35 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-   return p->vc & 0x3fffffff;
- }
- 
--static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
- {
-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-             s->nal_unit_type == NAL_TSA_N   ||
-             s->nal_unit_type == NAL_STSA_N  ||
-             s->nal_unit_type == NAL_RADL_N  ||
-             s->nal_unit_type == NAL_RASL_N )) {
--#define RPI_FAST_CACHEFLUSH
- #ifdef RPI_FAST_CACHEFLUSH
-         struct vcsm_user_clean_invalid_s iocache = {};
--        int curr_y = f->progress->data[0];
-+        int curr_y = ((int *)f->progress->data)[0];
-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+        int n_uv = n >> s->ps.sps->vshift[1];
-         int sz,base;
--        if (curr_y < 0) curr_y = 0;
--        if (n<=curr_y) return; // Should not happen
--        sz = s->frame->linesize[1] * (n-curr_y);
--        base = s->frame->linesize[1] * curr_y;
--        iocache.s[0].cmd = 3; // Flush L1 cache
--        iocache.s[0].addr = 0;
--        iocache.s[0].size  = 0;
--
--        iocache.s[1].cmd = 2;
--        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-+        if (curr_uv < 0) curr_uv = 0;
-+        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+        base = s->frame->linesize[1] * curr_uv;
-+        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-+        iocache.s[0].handle = p->vcsm_handle;
-+        iocache.s[0].cmd = 3; // clean+invalidate
-+        iocache.s[0].addr = p->arm + base;
-+        iocache.s[0].size  = sz;
-+        p = av_buffer_pool_opaque(s->frame->buf[2]);
-+        iocache.s[1].handle = p->vcsm_handle;
-+        iocache.s[1].cmd = 3; // clean+invalidate
-+        iocache.s[1].addr = p->arm + base;
-         iocache.s[1].size  = sz;
--
--        iocache.s[2].cmd = 2;
--        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
--        iocache.s[2].size  = sz;
--
--        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
--
-+        vcsm_clean_invalid( &iocache );
- #else
-         flush_buffer(s->frame->buf[1]);
-         flush_buffer(s->frame->buf[2]);
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index feb3284..aa65a77 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -211,6 +211,7 @@ static void gpu_unlock(void) {
- }
- 
- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+  p->numbytes = numbytes;
-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-   assert(p->vcsm_handle);
-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-@@ -243,13 +244,25 @@ int gpu_get_mailbox(void)
-   return gpu->mb;
- }
- 
-+// Call this to clean and invalidate a region of memory
- void gpu_cache_flush(GPU_MEM_PTR_T *p)
- {
--  void *tmp = vcsm_lock(p->vcsm_handle);
--  vcsm_unlock_ptr(tmp);
-+#define RPI_FAST_CACHEFLUSH
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = p->arm;
-+    iocache.s[0].size  = p->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp = vcsm_lock(p->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+#endif
- }
- 
- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+  p->numbytes = numbytes;
-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 2f08f03..0565a60 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -1,6 +1,8 @@
- #ifndef RPI_QPU_H
- #define RPI_QPU_H
- 
-+#define RPI_FAST_CACHEFLUSH
-+
- typedef struct gpu_mem_ptr_s {
-   unsigned char *arm; // Pointer to memory mapped on ARM side
-   int vc_handle;   // Videocore handle of relocatable memory
-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-index 95e6de1..db41a4d 100644
---- a/libavcodec/rpi_user_vcsm.h
-+++ b/libavcodec/rpi_user_vcsm.h
-@@ -1,29 +1,41 @@
--/*
--Copyright (c) 2012, Broadcom Europe Ltd
--All rights reserved.
--
--Redistribution and use in source and binary forms, with or without
--modification, are permitted provided that the following conditions are met:
--    * Redistributions of source code must retain the above copyright
--      notice, this list of conditions and the following disclaimer.
--    * Redistributions in binary form must reproduce the above copyright
--      notice, this list of conditions and the following disclaimer in the
--      documentation and/or other materials provided with the distribution.
--    * Neither the name of the copyright holder nor the
--      names of its contributors may be used to endorse or promote products
--      derived from this software without specific prior written permission.
--
--THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
--ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
--WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
--DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
--DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
--(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
--LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
--ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
--(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
--SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--*/
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
-+*    all reasonable efforts to protect the confidentiality thereof, and to
-+*    use this information only in connection with your use of Broadcom
-+*    integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
- 
- #ifndef __USER_VCSM__H__INCLUDED__
- #define __USER_VCSM__H__INCLUDED__
-@@ -424,21 +436,21 @@ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
- **
- ** structure contains a list of flush/invalidate commands. Commands are:
- ** 0: nop
--** 1: invalidate given physical range in L2
--** 2: clean      given physical range in L2
--** 3: clean+invalidate all of L1
--** 4: flush      all of L2 and all of L1
-+** 1: invalidate       given virtual range in L1/L2
-+** 2: clean            given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
- */
- struct vcsm_user_clean_invalid_s {
--    struct {
--       unsigned int cmd;
--       unsigned int addr;
--       unsigned int size;
--    } s[8];
-+   struct {
-+      unsigned int cmd;
-+      unsigned int handle;
-+      unsigned int addr;
-+      unsigned int size;
-+   } s[8];
- };
- 
--int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
--
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
- 
- #ifdef __cplusplus
- }
--- 
-2.7.4
-
-
-From 87a6cb3a4f7189e711c85de6d20077b6453b2ebe Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 23 May 2015 21:10:10 +0100
-Subject: [PATCH 46/68] Fix multi mailbox extra transform call
-
----
- libavcodec/hevc.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 9d12583..30f5834 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3024,7 +3024,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI_INTER_QPU
-         rpi_execute_inter_qpu(s);
- #endif
-+#ifndef RPI_MULTI_MAILBOX
-         rpi_execute_transform(s);
-+#endif
-         rpi_execute_inter_cmds(s);
-         vpu_wait(s->vpu_id);
-         rpi_execute_pred_cmds(s);
--- 
-2.7.4
-
-
-From 2a3672a1bda0296453953bebe8b17d69445260b4 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 27 May 2015 16:44:29 +0100
-Subject: [PATCH 47/68] Added support for running luma prediction on QPUs
-
----
- libavcodec/hevc.c          |  237 +++++++-
- libavcodec/hevc.h          |   26 +-
- libavcodec/hevc_filter.c   |   23 +-
- libavcodec/rpi_qpu.c       |  156 ++++--
- libavcodec/rpi_qpu.h       |    8 +-
- libavcodec/rpi_shader.c    | 1313 ++++++++++++++++++++++----------------------
- libavcodec/rpi_shader.h    |   21 +-
- libavcodec/rpi_shader.qasm |  883 ++++++++++++++---------------
- 8 files changed, 1464 insertions(+), 1203 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 30f5834..2da88ec 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -52,6 +52,11 @@
-     // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-     #define RPI_MULTI_MAILBOX
-   #endif
-+
-+  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-+
-+
- #endif
- 
- // #define DISABLE_MC
-@@ -74,6 +79,13 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
- 
-+// Split image of 2048 into parts 64 wide
-+// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-+// Each block of 64*64
-+// Smallest CTU size is 16x16, so smallest block is 8x8
-+// Corresponds to a total of 83kbytes over all 12 QPUs
-+#define RPI_LUMA_COMMAND_WORDS 9
-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
- 
- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
-@@ -2015,10 +2027,46 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi) {
-+            int reflist = 0;
-+            const Mv *mv         = &current_mv.mv[reflist];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            int chan = x0>>6; // 64 wide blocks per QPU
-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+            uint32_t *y = s->y_mvs[chan % 12];
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  if (weight_flag) {
-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+                  } else {
-+                      *y++ = 1; // Weight of 1 and offset of 0
-+                  }
-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->y_mvs[chan % 12] = y;
-+        } else
-+#endif
-+        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
- #ifdef RPI_INTER_QPU
-@@ -2078,10 +2126,47 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi) {
-+            int reflist = 1;
-+            const Mv *mv    = &current_mv.mv[reflist];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            int chan = x0>>6; // 64 wide blocks per QPU
-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+            uint32_t *y = s->y_mvs[chan % 12];
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  if (weight_flag) {
-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+                  } else {
-+                      *y++ = 1; // Weight of 1 and offset of 0
-+                  }
-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                }
-+            }
-+            s->y_mvs[chan % 12] = y;
-+        } else
-+#endif
-+
-+        {
-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
- #ifdef RPI_INTER_QPU
-@@ -2115,8 +2200,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-                       if (weight_flag) {
--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][0] & 0xffff);
-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][1] & 0xffff);
-                       } else {
-                           *u++ = 1; // Weight of 1 and offset of 0
-                           *u++ = 1;
-@@ -2143,9 +2228,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
- 
--        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-+#ifdef RPI_LUMA_QPU
-+        if (s->enable_rpi) {
-+            const Mv *mv    = &current_mv.mv[0];
-+            int mx          = mv->x & 3;
-+            int my          = mv->y & 3;
-+            int my_mx = (my<<8) + mx;
-+            const Mv *mv2    = &current_mv.mv[1];
-+            int mx2          = mv2->x & 3;
-+            int my2          = mv2->y & 3;
-+            int my2_mx2 = (my2<<8) + mx2;
-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
-+            int x1 = x0 + (mv->x >> 2);
-+            int y1 = y0 + (mv->y >> 2);
-+            int x2 = x0 + (mv2->x >> 2);
-+            int y2 = y0 + (mv2->y >> 2);
-+            int chan = x0>>6; // 64 wide blocks per QPU
-+            uint32_t *y = s->y_mvs[chan % 12];
-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = my2_mx2_my_mx;
-+                  *y++ = 1; // B frame weighted prediction not supported
-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                }
-+            }
-+            s->y_mvs[chan % 12] = y;
-+        } else
-+#endif
-+        {
-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-                    ref1->frame, &current_mv.mv[1], &current_mv);
-+        }
- 
-         if (s->ps.sps->chroma_format_idc) {
- #ifdef RPI_INTER_QPU
-@@ -2834,7 +2954,6 @@ static void rpi_inter_clear(HEVCContext *s)
-         *s->u_mvs[i]++ = pic_height;
-         *s->u_mvs[i]++ = s->frame->linesize[1];
-         *s->u_mvs[i]++ = s->frame->linesize[2];
--        *s->u_mvs[i]++ = i;
-         if (weight_flag) {
-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-@@ -2842,7 +2961,31 @@ static void rpi_inter_clear(HEVCContext *s)
-             *s->u_mvs[i]++ = 1 << 5;
-             *s->u_mvs[i]++ = 6;
-         }
-+        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+    }
 +
 +#ifdef RPI_LUMA_QPU
 +    for(i=0;i<12;i++) {
-+        s->y_mvs[i] = s->y_mvs_base[i];
-+        *s->y_mvs[i]++ = 0; // y_x
-+        *s->y_mvs[i]++ = 0; // ref_y_base
-+        *s->y_mvs[i]++ = 0; // y2_x2
-+        *s->y_mvs[i]++ = 0; // ref_y2_base
-+        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-+        if (weight_flag) {
-+            int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-+            int shift = s->sh.luma_log2_weight_denom + 6;
-+            *s->y_mvs[i]++ = (offset << 16) + shift;
-+        } else {
-+            int offset = 1 << 5;
-+            int shift = 6;
-+            *s->y_mvs[i]++ = (offset << 16) + shift;
-+        }
-+        *s->y_mvs[i]++ = 0; // Next kernel
-     }
-+#endif
- }
- 
- static void rpi_execute_inter_qpu(HEVCContext *s)
-@@ -2850,6 +2993,9 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-     int k;
-     int i;
-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+#ifdef RPI_LUMA_QPU
-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
-+#endif
-     if (s->sh.slice_type == I_SLICE) {
- #ifdef RPI_MULTI_MAILBOX
-       rpi_execute_transform(s);
-@@ -2865,8 +3011,23 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- 
-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
- 
-+#ifdef RPI_LUMA_QPU
-+    for(k=0;k<12;k++) {
-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
++        // This needs to have a generally similar structure to the
++        // actual filter code as various pipelined bits need to land correctly
++        // when inserted by the filter requests
++        s->y_mvs[job][i] = s->y_mvs_base[job][i];
++        *s->y_mvs[job][i]++ = 0; // y_x
++        *s->y_mvs[job][i]++ = 0; // ref_y_base
++        *s->y_mvs[job][i]++ = 0; // y2_x2
++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
++        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight denom + 6
++        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
++        *s->y_mvs[job][i]++ = 0; // Next kernel
 +    }
-+    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+#endif
-+
-+
- #ifdef RPI_MULTI_MAILBOX
-+#ifdef RPI_CACHE_UNIF_MVS
-+    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-+#else
-     gpu_cache_flush(&s->coeffs_buf_accelerated);
-+#endif
-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
-                                    (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-@@ -2876,7 +3037,27 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-                                    (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-                                    (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-                                    (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+#ifdef RPI_LUMA_QPU
-+                                   qpu_get_fn(QPU_MC_SETUP),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
-+#else
-+                                   0,
-+                                   0,0,0,0,
-+                                   0,0,0,0,
-+                                   0,0,0,0
-+#endif
-                                  );
-     for(i=0;i<4;i++)
-         s->num_coeffs[i] = 0;
-@@ -2892,6 +3073,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-       );
- #endif
-+
-+
- }
- #endif
- 
-@@ -3579,8 +3762,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
- fail:
-     if (s->ref && s->threads_type == FF_THREAD_FRAME) {
- #ifdef RPI_INTER_QPU
--        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
--        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
- #endif
-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-     }
-@@ -3767,7 +3949,6 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
- 
- #ifdef RPI
-     av_freep(&s->unif_mv_cmds);
--    av_freep(&s->unif_xfm_cmds);
-     av_freep(&s->univ_pred_cmds);
- 
- #ifdef RPI_INTER_QPU
-@@ -3776,7 +3957,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-         s->unif_mvs = 0;
-     }
- #endif
--    //gpu_free(&s->dummy);
-+#ifdef RPI_LUMA_QPU
-+    if (s->y_unif_mvs) {
-+        gpu_free( &s->y_unif_mvs_ptr );
-+        s->y_unif_mvs = 0;
-+    }
-+#endif
- 
- #ifdef EARLY_MALLOC
-     printf("hevc_decode_free\n");
-@@ -3861,9 +4047,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-     if (!s->unif_mv_cmds)
-         goto fail;
--    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
--    if (!s->unif_xfm_cmds)
--        goto fail;
-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-     if (!s->univ_pred_cmds)
-         goto fail;
-@@ -3877,7 +4060,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     {
-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-         uint32_t *p;
-+#ifdef RPI_CACHE_UNIF_MVS
-+        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+#else
-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+#endif
-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
- 
-         // Set up initial locations for uniform streams
-@@ -3892,6 +4079,28 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
- 
-     }
- #endif
-+#ifdef RPI_LUMA_QPU
-+    {
-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-+        uint32_t *p;
-+#ifdef RPI_CACHE_UNIF_MVS
-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-+#else
-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-+#endif
-+        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+
-+        // Set up initial locations for uniform streams
-+        p = s->y_unif_mvs;
-+        for(i = 0; i < 12; i++) {
-+            s->y_mvs_base[i] = p;
-+            p += y_commands_per_qpu;
-+        }
-+        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+
-+    }
-+#endif
-     //gpu_malloc_uncached(2048*64,&s->dummy);
- 
- #ifdef EARLY_MALLOC
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 4a39e39..5df9dcd 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -44,9 +44,13 @@
- #ifdef RPI
- 
-   #include "rpi_qpu.h"
--  // Use QPU for inter prediction
-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
-   #define RPI_INTER_QPU
- 
-+  #ifdef RPI_INTER_QPU
-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-+    #define RPI_LUMA_QPU
-+  #endif
- #endif
- 
- #define MAX_DPB_SIZE 16 // A.4.1
-@@ -809,7 +813,6 @@ typedef struct HEVCLocalContext {
- 
- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
--#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
- // Each block can have an intra prediction and a transform_add command
- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
- // Worst case is 16x16 CTUs
-@@ -844,9 +847,6 @@ typedef struct HEVCMvCmd {
-     int8_t ref_idx[2];
- } HEVCMvCmd;
- 
--// Command for transform to process a block of coefficients
--typedef struct HEVCXfmCmd {
--} HEVCXfmCmd;
- 
- // Command for intra prediction and transform_add of predictions to coefficients
- #define RPI_PRED_TRANSFORM_ADD 0
-@@ -892,8 +892,7 @@ typedef struct HEVCContext {
- 
- #ifdef RPI
-     int enable_rpi;
--    HEVCMvCmd *unif_mv_cmds;  // TODO rename
--    HEVCXfmCmd *unif_xfm_cmds;
-+    HEVCMvCmd *unif_mv_cmds;
-     HEVCPredCmd *univ_pred_cmds;
-     int buf_width;
-     GPU_MEM_PTR_T coeffs_buf_default;
-@@ -920,6 +919,15 @@ typedef struct HEVCContext {
-     uint32_t mc_filter_uv_b0;
-     uint32_t mc_filter_uv_b;
- #endif
-+#ifdef RPI_LUMA_QPU
-+    GPU_MEM_PTR_T y_unif_mvs_ptr;
-+    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
-+    uint32_t *y_mvs_base[12];
-+    uint32_t *y_mvs[12];
-+    // Function pointers
-+    uint32_t mc_filter;
-+    uint32_t mc_filter_b;
-+#endif
- 
- #endif
- 
-@@ -1166,6 +1174,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                  int log2_trafo_size, enum ScanType scan_idx,
-                                  int c_idx);
- 
-+#ifdef RPI_INTER_QPU
-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
-+#endif
-+
- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
- 
- 
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index ec84e8a..11629e4 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -883,8 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-   return p->vc & 0x3fffffff;
- }
- 
--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- {
-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-             s->nal_unit_type == NAL_TSA_N   ||
-@@ -911,10 +910,24 @@ void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-         iocache.s[1].cmd = 3; // clean+invalidate
-         iocache.s[1].addr = p->arm + base;
-         iocache.s[1].size  = sz;
-+
-+#ifdef RPI_LUMA_QPU
-+        p = av_buffer_pool_opaque(s->frame->buf[0]);
-+        sz = s->frame->linesize[0] * (n-curr_y);
-+        base = s->frame->linesize[0] * curr_y;
-+        iocache.s[2].handle = p->vcsm_handle;
-+        iocache.s[2].cmd = 3; // clean+invalidate
-+        iocache.s[2].addr = p->arm + base;
-+        iocache.s[2].size  = sz;
-+#endif
-         vcsm_clean_invalid( &iocache );
- #else
-         flush_buffer(s->frame->buf[1]);
-         flush_buffer(s->frame->buf[2]);
-+#ifdef RPI_LUMA_QPU
-+        flush_buffer(s->frame->buf[1]);
-+#endif
-+
- #endif
-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-@@ -938,7 +951,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             sao_filter_CTB(s, x, y - ctb_size);
-             if (s->threads_type & FF_THREAD_FRAME ) {
- #ifdef RPI_INTER_QPU
--                ff_hevc_flush_chroma(s,&s->ref->tf, y);
-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
- #endif
-                 ff_thread_report_progress(&s->ref->tf, y, 0);
-             }
-@@ -947,7 +960,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-             sao_filter_CTB(s, x , y);
-             if (s->threads_type & FF_THREAD_FRAME ) {
- #ifdef RPI_INTER_QPU
--                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
- #endif
-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-             }
-@@ -957,7 +970,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-         //int currh = s->ref->tf.progress->data[0];
-         //if (((y + ctb_size)&63)==0)
- #ifdef RPI_INTER_QPU
--        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
- #endif
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-     }
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index aa65a77..e12304b 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -1,9 +1,11 @@
- #ifdef RPI
- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
--#define RPI_TIME_TOTAL_QPU
-+//#define RPI_TIME_TOTAL_QPU
- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
- //#define RPI_TIME_TOTAL_VPU
-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-+//#define RPI_TIME_TOTAL_POSTED
- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
- #define RPI_ASYNC
- 
-@@ -94,7 +96,8 @@ struct GPU
-   int open_count; // Number of allocated video buffers
-   int      mb; // Mailbox handle
-   int      vc; // Address in GPU memory
--  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
- };
- 
- // Stop more than one thread trying to allocate memory or use the processing resources at once
-@@ -102,7 +105,7 @@ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
- static volatile struct GPU* gpu = NULL;
- static GPU_MEM_PTR_T gpu_mem_ptr;
- 
--#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
- static unsigned int Microseconds(void) {
-     struct timespec ts;
-     unsigned int x;
-@@ -123,7 +126,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
- 
--static int vpu_cmds[MAXCMDS][16];
-+static int vpu_cmds[MAXCMDS][32];
- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
- static volatile int vpu_async_head=0;
- #endif
-@@ -247,7 +250,6 @@ int gpu_get_mailbox(void)
- // Call this to clean and invalidate a region of memory
- void gpu_cache_flush(GPU_MEM_PTR_T *p)
- {
--#define RPI_FAST_CACHEFLUSH
- #ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
-     iocache.s[0].handle = p->vcsm_handle;
-@@ -261,6 +263,34 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
- #endif
- }
- 
-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    iocache.s[0].handle = p0->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = (int) p0->arm;
-+    iocache.s[0].size  = p0->numbytes;
-+    iocache.s[1].handle = p1->vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = (int) p1->arm;
-+    iocache.s[1].size  = p1->numbytes;
-+    iocache.s[2].handle = p2->vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = (int) p2->arm;
-+    iocache.s[2].size  = p2->numbytes;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    void *tmp;
-+    tmp = vcsm_lock(p0->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p1->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
-+    tmp = vcsm_lock(p2->vcsm_handle);
-+    vcsm_unlock_ptr(tmp);
++    s->curr_y_mvs = s->y_mvs[job][0];
 +#endif
++    s->ctu_count = 0;
 +}
-+
- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-   p->numbytes = numbytes;
-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-@@ -357,9 +387,19 @@ unsigned int vpu_get_constants(void) {
- #ifdef RPI_ASYNC
- 
- static void *vpu_start(void *arg) {
-+#ifdef RPI_TIME_TOTAL_POSTED
-+  int last_time=0;
-+  long long on_time=0;
-+  long long off_time=0;
-+  int start_time;
-+  int end_time;
-+  int count=0;
 +#endif
-   while(1) {
-+    int i;
-     int *p;
-     int qpu_code;
-+    int qpu_codeb;
-     pthread_mutex_lock(&post_mutex);
-     while( vpu_async_tail - vpu_async_head <= 0)
-     {
-@@ -373,24 +413,49 @@ static void *vpu_start(void *arg) {
-       break; // Last job
-     }
-     qpu_code = p[7];
-+    qpu_codeb = p[16];
-     //if (p[7]) {
-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-         //gpu_cache_flush(buf);
-     //}
-+
-+#ifdef RPI_TIME_TOTAL_POSTED
-+    start_time = Microseconds();
-+    if (last_time==0)
-+      last_time = start_time;
-+    off_time += start_time-last_time;
-+#endif
-+
-     if (!qpu_code) {
-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-     } else {
--      int i;
-       for(i=0;i<8;i++) {
-         gpu->mail[i*2] = p[8+i];
-         gpu->mail[i*2 + 1] = qpu_code;
-       }
--
--      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
--                              0, 0, 0, 0,
-+      for(i=0;i<12;i++) {
-+        gpu->mail2[i*2] = p[17+i];
-+        gpu->mail2[i*2 + 1] = qpu_codeb;
-+      }
-+#if (0)
-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+      execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
-+#else
-+      execute_multi(gpu->mb,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+#endif
-     }
-+#ifdef RPI_TIME_TOTAL_POSTED
-+    end_time = Microseconds();
-+    last_time = end_time;
-+    on_time += end_time - start_time;
-+    count++;
-+    if ((count&0x7f)==0)
-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-+#endif
-     pthread_mutex_lock(&post_mutex);
-     vpu_async_head++;
-     pthread_cond_broadcast(&post_cond_head);
-@@ -436,7 +501,9 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
- }
- 
- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-+                      )
- {
- 
-   pthread_mutex_lock(&post_mutex);
-@@ -464,6 +531,21 @@ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2,
-     p[13] = unifs6;
-     p[14] = unifs7;
-     p[15] = unifs8;
-+
-+    p[16] = qpu_codeb;
-+    p[17] = unifs1b;
-+    p[18] = unifs2b;
-+    p[19] = unifs3b;
-+    p[20] = unifs4b;
-+    p[21] = unifs5b;
-+    p[22] = unifs6b;
-+    p[23] = unifs7b;
-+    p[24] = unifs8b;
-+    p[25] = unifs9b;
-+    p[26] = unifs10b;
-+    p[27] = unifs11b;
-+    p[28] = unifs12b;
-+
-     if (num<=1)
-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-     pthread_mutex_unlock(&post_mutex);
-@@ -544,27 +626,27 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-   off_time += start_time-last_time;
- #endif
-   for(i=0;i<num;i++) {
--    gpu->mail[i*2 + 1] = code;
-+    gpu->mail2[i*2 + 1] = code;
-   }
-   for(;i<num+num2;i++) {
--    gpu->mail[i*2 + 1] = code2;
-+    gpu->mail2[i*2 + 1] = code2;
-   }
--  gpu->mail[0 ] = unifs1;
--  gpu->mail[2 ] = unifs2;
--  gpu->mail[4 ] = unifs3;
--  gpu->mail[6 ] = unifs4;
--  gpu->mail[8 ] = unifs5;
--  gpu->mail[10] = unifs6;
--	gpu->mail[12] = unifs7;
--	gpu->mail[14] = unifs8;
--	gpu->mail[16] = unifs9;
--	gpu->mail[18] = unifs10;
--	gpu->mail[20] = unifs11;
--	gpu->mail[22] = unifs12;
-+  gpu->mail2[0 ] = unifs1;
-+  gpu->mail2[2 ] = unifs2;
-+  gpu->mail2[4 ] = unifs3;
-+  gpu->mail2[6 ] = unifs4;
-+  gpu->mail2[8 ] = unifs5;
-+  gpu->mail2[10] = unifs6;
-+	gpu->mail2[12] = unifs7;
-+	gpu->mail2[14] = unifs8;
-+	gpu->mail2[16] = unifs9;
-+	gpu->mail2[18] = unifs10;
-+	gpu->mail2[20] = unifs11;
-+	gpu->mail2[22] = unifs12;
- 	execute_qpu(
- 		gpu->mb,
- 		12 /* Number of QPUs */,
--		gpu->vc + offsetof(struct GPU, mail),
-+		gpu->vc + offsetof(struct GPU, mail2),
- 		1 /* no flush */,  // Don't flush VPU L1 cache
- 		5000 /* timeout ms */);
- #ifdef RPI_TIME_TOTAL_QPU
-@@ -635,21 +717,21 @@ unsigned int qpu_get_fn(int num) {
-       gpu_unlock();
-     }
-     switch(num) {
--    //case QPU_MC_SETUP:
--    //  fn = mc_setup;
--    //  break;
--    //case QPU_MC_FILTER:
--    //  fn = mc_filter;
--    //  break;
-+    case QPU_MC_SETUP:
-+      fn = mc_setup;
-+      break;
-+    case QPU_MC_FILTER:
-+      fn = mc_filter;
-+      break;
-     case QPU_MC_EXIT:
-       fn = mc_exit;
-       break;
--    //case QPU_MC_INTERRUPT_EXIT:
--    //  fn = mc_interrupt_exit;
--    //  break;
--    //case QPU_MC_FILTER_B:
--    //  fn = mc_filter_b;
--    //  break;
-+    case QPU_MC_INTERRUPT_EXIT12:
-+      fn = mc_interrupt_exit12;
-+      break;
-+    case QPU_MC_FILTER_B:
-+      fn = mc_filter_b;
-+      break;
-     //case QPU_MC_FILTER_HONLY:
-     //  fn = mc_filter_honly;
-     //  break;
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 0565a60..81c2bb1 100644
---- a/libavcodec/rpi_qpu.h
-+++ b/libavcodec/rpi_qpu.h
-@@ -1,6 +1,7 @@
- #ifndef RPI_QPU_H
- #define RPI_QPU_H
- 
-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
- #define RPI_FAST_CACHEFLUSH
- 
- typedef struct gpu_mem_ptr_s {
-@@ -16,6 +17,7 @@ extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
- extern void gpu_free(GPU_MEM_PTR_T *p);
- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
- 
- // QPU specific functions
- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-@@ -26,7 +28,7 @@ enum {
-   QPU_MC_SETUP,
-   QPU_MC_FILTER,
-   QPU_MC_EXIT,
--  QPU_MC_INTERRUPT_EXIT,
-+  QPU_MC_INTERRUPT_EXIT12,
-   QPU_MC_FILTER_B,
-   QPU_MC_FILTER_HONLY,
-   QPU_MC_SETUP_UV,
-@@ -44,7 +46,9 @@ extern unsigned int vpu_get_constants(void);
- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-+                      );
- extern void vpu_wait( int id);
- 
- // Simple test of shader code
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index a0f0282..e86eb30 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -48,693 +48,674 @@ unsigned int rpi_shader[] = {
- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
--/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
--/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
--/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
--/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
--/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
--/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
--/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
--/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
--/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
--/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
--/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x000000d0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000d8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000e8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000000f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000100] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000108] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x00000110] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000118] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+/* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-+/* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
-+/* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+/* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000158] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000160] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000168] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000170] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000178] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000180] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000188] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000190] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000198] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000001a0] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-+/* [0x000001a8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001b0] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-+/* [0x000001b8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x000001c0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x000001c8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x000001d0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
- /* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
--/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
--/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
--/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
--/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
--/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
--/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x000001e0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x000001e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000001f0] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x000001f8] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000200] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
- // ::mc_filter_uv
--/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
--/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000208] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000210] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000230] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000238] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000240] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000248] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000250] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000260] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000268] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000270] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000278] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000280] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000288] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000290] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000298] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000002a0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000002a8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000002b0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x000002b8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000002c8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000002d0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000002d8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000002e0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000002e8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000002f0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000002f8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000300] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000308] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000310] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000318] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+/* [0x00000320] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000328] */ 0x0f9e7080, 0x100208e7, // asr r3, r0, r2
-+/* [0x00000330] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000338] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+/* [0x00000340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000348] */ 0x0f9e7080, 0x100608e7, // asr.ifnz r3, r0, r2
-+/* [0x00000350] */ 0x119c87c0, 0xd00213a7, // shl rb14,r3,8
-+/* [0x00000358] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop
--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
--/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
--/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
--/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
--/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
--/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
--/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
--/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
--/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
--/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000360] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000368] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000370] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000378] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000380] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000003a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000003a8] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000003b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000003c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000003c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000003d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000003d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000003e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x000003e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x000003f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000003f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000400] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000408] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000410] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000418] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000420] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000428] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000430] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000438] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000440] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000448] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000450] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000458] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000460] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x00000468] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000470] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000478] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000480] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x00000488] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00000490] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000498] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000004a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000004b0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000004b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000004c0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000004c8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_uv_b0
--/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
--/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x000004e0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x000004e8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x000004f0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x000004f8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000500] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000508] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000510] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000518] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000520] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000528] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000530] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000538] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+/* [0x00000540] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x00000548] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000550] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x00000558] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000560] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000568] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x00000570] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x00000578] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000580] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000588] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000590] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000598] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x000005c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x000005d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x000005d8] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x000005e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000005f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x000005f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000600] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b0
--/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
--/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
--/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
--/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
--/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000610] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00000628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x00000630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x00000648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x00000650] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000660] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x00000668] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x00000670] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x00000678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000680] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000690] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x000006a0] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x000006a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x000006b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000006b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x000006c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000006c8] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000006d0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000006d8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000006e0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000006e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000006f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000006f8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x00000700] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x00000708] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000710] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000720] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000728] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
- // ::mc_filter_uv_b
--/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
--/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
--/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
--/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
--/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
--/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
--/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
--/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
--/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
--/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
--/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
--/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
--/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000740] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000748] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000750] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000758] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+/* [0x00000760] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000768] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000770] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+/* [0x00000778] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000780] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+/* [0x00000788] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000790] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000798] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x000007a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x000007b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+/* [0x000007b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x000007c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x000007c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+/* [0x000007d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+/* [0x000007d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x000007e0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+/* [0x000007e8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+/* [0x000007f0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x000007f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000800] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+/* [0x00000808] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000810] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x00000818] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000820] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000828] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000830] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000838] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+/* [0x00000840] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000848] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+/* [0x00000850] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000858] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000860] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :uvloop_b
--/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
--/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
--/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
--/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
--/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
--/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
--/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
--/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
--/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
--/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
--/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
--/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
--/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
--/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+/* [0x00000890] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+/* [0x000008d0] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_exit
--/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a18] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a28] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_interrupt_exit8
--/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a58] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000a68] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_setup
--/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
--/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
--/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
--/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
--/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
--/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
--/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
--/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
--/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
--/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
--/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
--/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
--/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
--/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
--/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
--/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
--/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
--/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
--/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
--/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
--/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
--/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
--/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
--/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
--/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
--/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
--/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
--/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
--/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
--/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
--/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
--/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
--/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
--/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
--/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
--/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
--/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
--/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
--/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
--/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
--/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
--/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
--/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
--/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
--/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
--/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
--/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000ac0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-+/* [0x00000ac8] */ 0x15827d80, 0x10020227, // mov ra8, unif
-+/* [0x00000ad0] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000ad8] */ 0x15827d80, 0x100202a7, // mov ra10, unif
-+/* [0x00000ae0] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000ae8] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000af0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000af8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000b00] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000b08] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-+/* [0x00000b10] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-+/* [0x00000b18] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000b20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000b28] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000b30] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000b38] */ 0x15227d80, 0x10020867, // mov r1, ra8
-+/* [0x00000b40] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000b48] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000b50] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000b58] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000b60] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000b68] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-+/* [0x00000b70] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000b78] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000b80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000b88] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000b90] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000b98] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000ba0] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000ba8] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000bb0] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-+/* [0x00000bb8] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000bc0] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000bc8] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000bd0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000bd8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000be0] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-+/* [0x00000be8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000bf0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+/* [0x00000bf8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000c00] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000c08] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000c10] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000c18] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+/* [0x00000c20] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000c28] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+/* [0x00000c30] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+/* [0x00000c38] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000c40] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000c48] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+/* [0x00000c50] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000c58] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000c60] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000c68] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000c70] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000c78] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000c80] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000c88] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000c90] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000c98] */ 0x00004000, 0xe00204a7, // mov ra18, 0x4000
-+/* [0x00000ca0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000ca8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000cb0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000cb8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000cc0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000cc8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000cd8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000ce0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000ce8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000cf0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000cf8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000d00] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000d08] */ 0x919c82ff, 0xd0024822, // shl r0,r1,r3 ; mov r2,8
-+/* [0x00000d10] */ 0x0f9e70c0, 0x10021367, // asr rb13,r0,r3
-+/* [0x00000d18] */ 0x0f9e72c0, 0x10021327, // asr rb12,r1,r3
-+/* [0x00000d20] */ 0x0c9cde80, 0x10021367, // add rb13,rb13,r2
-+/* [0x00000d28] */ 0x119cce80, 0x10021327, // shl rb12, rb12, r2
-+/* [0x00000d30] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000d38] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000d40] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000d48] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000d50] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000d58] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+/* [0x00000d60] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
- /* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
- /* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
--/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
--// ::mc_filter
-+/* [0x00000d78] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
-+// :per_block_setup
- /* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- /* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
- /* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
- /* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
--/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
--/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
--/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
--/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
--/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000da0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-+/* [0x00000da8] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000db0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000db8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000dc0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000dc8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000dd0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000dd8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000de0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000de8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+/* [0x00000df0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000df8] */ 0x8c827436, 0x100246a1, // add ra_frame_base_next, r2, r0 ; mov r1, unif
-+/* [0x00000e00] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000e08] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000e10] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000e18] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000e20] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000e28] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000e30] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+/* [0x00000e38] */ 0x159e7240, 0x10021067, // mov ra_y2_next, r1
-+/* [0x00000e40] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000e48] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000e50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000e58] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000e60] */ 0x0e9e70c0, 0x10020867, // shr r1, r0, r3
-+/* [0x00000e68] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+/* [0x00000e70] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+/* [0x00000e78] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+/* [0x00000e80] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+/* [0x00000e88] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+/* [0x00000e90] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000e98] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-+/* [0x00000ea0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x00000ea8] */ 0x95801dbf, 0xd0024821, // mov r0, unif ; mov r1,1
-+/* [0x00000eb0] */ 0x4f5971c6, 0x10024260, // asr ra9, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024220, // asr ra8, r0, rb23;      mul24 r0, r0, ra22
-+/* [0x00000ec0] */ 0x4f5971c6, 0x10044260, // asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22
-+/* [0x00000ec8] */ 0x0f9d71c0, 0x10040227, // asr.ifz ra8, r0, rb23
-+/* [0x00000ed0] */ 0x0d243f80, 0xd0020267, // sub ra9,3,ra9
-+/* [0x00000ed8] */ 0x0d203f80, 0xd0020227, // sub ra8,3,ra8
-+/* [0x00000ee0] */ 0x11243dc0, 0xd0020267, // shl ra9,ra9,3
-+/* [0x00000ee8] */ 0x11203dc0, 0xd0020227, // shl ra8,ra8,3
-+/* [0x00000ef0] */ 0x00ffff00, 0xe0020867, // mov r1,0xffff00
-+/* [0x00000ef8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000f00] */ 0x0f9d71c0, 0x10020027, // asr ra0, r0, rb23
-+/* [0x00000f08] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
- /* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
--/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+/* [0x00000f18] */ 0x01040400, 0xe0020867, // mov r1,0x1040400
-+/* [0x00000f20] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000f28] */ 0x0f9d71c0, 0x10020067, // asr ra1, r0, rb23
-+/* [0x00000f30] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00000f38] */ 0x0f9d71c0, 0x10021167, // asr rb5, r0, rb23
-+/* [0x00000f40] */ 0xfbf5f600, 0xe0020867, // mov r1,0xfbf5f600
-+/* [0x00000f48] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000f50] */ 0x0f9d71c0, 0x100200a7, // asr ra2, r0, rb23
-+/* [0x00000f58] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00000f60] */ 0x0f9d71c0, 0x100211a7, // asr rb6, r0, rb23
-+/* [0x00000f68] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000f70] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000f78] */ 0x0f9d71c0, 0x100200e7, // asr ra3, r0, rb23
-+/* [0x00000f80] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00000f88] */ 0x0f9d71c0, 0x100211e7, // asr rb7, r0, rb23
-+/* [0x00000f90] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000f98] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000fa0] */ 0x0f9d71c0, 0x10020127, // asr ra4, r0, rb23
-+/* [0x00000fa8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00000fb0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+/* [0x00000fb8] */ 0xf6f5fb00, 0xe0020867, // mov r1,0xf6f5fb00
-+/* [0x00000fc0] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000fc8] */ 0x0f9d71c0, 0x10020167, // asr ra5, r0, rb23
-+/* [0x00000fd0] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00000fd8] */ 0x0f9d71c0, 0x10021267, // asr rb9, r0, rb23
-+/* [0x00000fe0] */ 0x04040100, 0xe0020867, // mov r1,0x4040100
-+/* [0x00000fe8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00000ff0] */ 0x0f9d71c0, 0x100201a7, // asr ra6, r0, rb23
-+/* [0x00000ff8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00001000] */ 0x0f9d71c0, 0x100212a7, // asr rb10, r0, rb23
-+/* [0x00001008] */ 0xffff0000, 0xe0020867, // mov r1,0xffff0000
-+/* [0x00001010] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-+/* [0x00001018] */ 0x0f9d71c0, 0x100201e7, // asr ra7, r0, rb23
-+/* [0x00001020] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+/* [0x00001028] */ 0x0f9d71c0, 0x100212e7, // asr rb11, r0, rb23
-+/* [0x00001030] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00001038] */ 0x0f9e70c0, 0x100213e7, // asr rb15, r0, r3
-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00001048] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-+/* [0x00001050] */ 0x8f9c00ff, 0xd0024823, // asr r0, r0, r3 ; mov r3, 0
-+/* [0x00001058] */ 0x119c81c0, 0xd00213a7, // shl rb14, r0, 8
-+// ::mc_filter
- // :yloop
--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
--/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
--/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
--/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
--/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
--/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
--/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
--/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
--/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
--/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
--/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
--/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
--/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
--/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
--/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
--/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
--/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00001060] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00001068] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00001070] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00001078] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001080] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00001088] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00001090] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001098] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000010a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x000010a8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x000010b0] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x000010b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000010c0] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x000010c8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+/* [0x000010d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000010d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000010e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000010e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000010f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000010f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x00001100] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00001108] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00001110] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001118] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001120] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001128] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001130] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001138] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001140] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001148] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001150] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001158] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001160] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-+/* [0x00001168] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001170] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001178] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001180] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001188] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001190] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001198] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000011a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000011a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000011b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000011b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000011c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000011c8] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000011d0] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000011d8] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000011e0] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000011e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000011f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000011f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001200] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+/* [0x00001208] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00001210] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001218] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00001220] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+/* [0x00001228] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001230] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001238] */ 0xfffffb28, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00001240] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001248] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001250] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_filter_b
--/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
--/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
--/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
--/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
--/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
--/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
--/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
--/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
--/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
--/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
--/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
--/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
--/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
--/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
--/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
--/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
--/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
--/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
--/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
--/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
--/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
--/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
--/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
--/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
--/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
--/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
--/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
--/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
--/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
--/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
--/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
--/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
--/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
--/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
--/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
--/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
--/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
--/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
--/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
--/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
--/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
--/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
--/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
--/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
--/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
- // :yloopb
--/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
--/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
--/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
--/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
--/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
--/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
--/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
--/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
--/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
--/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
--/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
--/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
--/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
--/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
--/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
--/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
--/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
--/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
--/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
--/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
--/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
--/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
--/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
--/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
--/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
--/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
--/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
--/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
--/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
--/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
--/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
--/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
--/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
--/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
--/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
--/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
--/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
--/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
--/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
--/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00001258] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+/* [0x00001260] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+/* [0x00001268] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00001270] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+/* [0x00001278] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+/* [0x00001280] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+/* [0x00001288] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00001290] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00001298] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+/* [0x000012a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+/* [0x000012a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+/* [0x000012b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x000012b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+/* [0x000012c0] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+/* [0x000012c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000012d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+/* [0x000012d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+/* [0x000012e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+/* [0x000012e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+/* [0x000012f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+/* [0x000012f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+/* [0x00001300] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+/* [0x00001308] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+/* [0x00001310] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+/* [0x00001318] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+/* [0x00001320] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+/* [0x00001328] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+/* [0x00001330] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+/* [0x00001338] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+/* [0x00001340] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+/* [0x00001348] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+/* [0x00001350] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+/* [0x00001358] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-+/* [0x00001360] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+/* [0x00001368] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+/* [0x00001370] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+/* [0x00001378] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+/* [0x00001380] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001388] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+/* [0x00001390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00001398] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+/* [0x000013a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+/* [0x000013a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+/* [0x000013b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+/* [0x000013b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+/* [0x000013c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+/* [0x000013c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+/* [0x000013d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+/* [0x000013d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+/* [0x000013e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+/* [0x000013e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+/* [0x000013f0] */ 0x0f9ce3c0, 0xd0020827, // asr r0, r1, 14
-+/* [0x000013f8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00001400] */ 0x405b8006, 0xd00049e0, // nop                     ; mul24 r0, r0 << 8, ra22 << 8
-+/* [0x00001408] */ 0x0c4a7380, 0x10020867, // add r1, r1, ra18
-+/* [0x00001410] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+/* [0x00001418] */ 0xfffffe20, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001420] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+/* [0x00001428] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+/* [0x00001430] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001438] */ 0xfffff928, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00001440] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001448] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001450] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
- // ::mc_interrupt_exit12
--/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
--/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
--/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
--/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
--/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
--/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+/* [0x00001458] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x00001460] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001468] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001470] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001478] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001480] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001488] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001490] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001498] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000014d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x000014e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000014e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+// ::mc_exit1
-+/* [0x000014f0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+/* [0x000014f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001500] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001508] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001510] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001518] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+/* [0x00001520] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001528] */ 0x009e7000, 0x100009e7, // nop        ; nop
- // ::mc_end
- };
- #ifdef __HIGHC__
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-index 6e552d9..760bd17 100644
---- a/libavcodec/rpi_shader.h
-+++ b/libavcodec/rpi_shader.h
-@@ -4,15 +4,16 @@
- extern unsigned int rpi_shader[];
- 
- #define mc_setup_uv (rpi_shader + 0)
--#define mc_filter_uv (rpi_shader + 144)
--#define mc_filter_uv_b0 (rpi_shader + 334)
--#define mc_filter_uv_b (rpi_shader + 486)
--#define mc_exit (rpi_shader + 662)
--#define mc_interrupt_exit8 (rpi_shader + 680)
--#define mc_setup (rpi_shader + 710)
--#define mc_filter (rpi_shader + 864)
--#define mc_filter_b (rpi_shader + 1104)
--#define mc_interrupt_exit12 (rpi_shader + 1360)
--#define mc_end (rpi_shader + 1398)
-+#define mc_filter_uv (rpi_shader + 130)
-+#define mc_filter_uv_b0 (rpi_shader + 312)
-+#define mc_filter_uv_b (rpi_shader + 464)
-+#define mc_exit (rpi_shader + 640)
-+#define mc_interrupt_exit8 (rpi_shader + 658)
-+#define mc_setup (rpi_shader + 688)
-+#define mc_filter (rpi_shader + 1048)
-+#define mc_filter_b (rpi_shader + 1174)
-+#define mc_interrupt_exit12 (rpi_shader + 1302)
-+#define mc_exit1 (rpi_shader + 1340)
-+#define mc_end (rpi_shader + 1356)
- 
- #endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index a0b8e5a..60d1ec2 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -21,6 +21,7 @@
- #
- # ra16                                          clipped(row start address+elem_num)&~3
- # ra17                                          per-channel shifts
-+# ra18                                          0x4000
- # ra19                                          next ra17
- #
- # rb16                                          pitch
-@@ -86,7 +87,7 @@
- 
- 
- ################################################################################
--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
- ::mc_setup_uv
- 
- # Read starting kernel
-@@ -132,36 +133,6 @@ mov ra13, 0
- mov ra14, 0
- mov ra15, 0
- 
--# Compute part of VPM to use for DMA output
--mov r3, unif
--shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
--and r2, r2, 15
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
--shl r0, r0, 5
--add rb27, r0, r1
--
--# Compute part of VPM to save data into
--shl r2, r3, 1
--and r2, r2, 15    # r2 = bcd0
--mov r1, r2        # r1 = bcd0
--asr r1, r1, 2     # r1 = bc
--shl r1, r1, 6     # r1 = bc000000
--mov r0, r2        # r0 = bcd0
--and r0, r0, 3     # r0 = d0
--add r0, r0, r1    # r0 = bc0000d0
--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
--add rb28, r0, r1
--asr r0, r0, 1     # r0 = bc0000d
--# Prepare VPM command for 16bit intermediates
--mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
--add rb21, r0, r1
--
- # Compute base address for first and second access
- mov r0, ra_x           # Load x
- max r0, r0, 0; mov r1, ra_y # Load y
-@@ -175,10 +146,31 @@ min r1, r1, rb_frame_height_minus_1
- # submit texture requests for first line
- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
- add t0s, r0, r1 ; mov ra_frame_base, r2
--add t0s, r2, r1
-+add t1s, r2, r1
-+
-+mov r2,8
-+shl rb12,unif, r2 # offset before shift
-+add rb13,unif,r2  # offset after shift
-+
-+# Compute part of VPM to use for DMA output
-+mov r2, unif
-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
- 
--mov rb12,unif # offset before shift
--mov rb13,unif # offset after shift
-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+add rb28, r0, r1  # VPM 8bit storage
-+asr r2, r0, 1     # r2 = bc0000d (r0 is left unshifted for the DMA setup below)
-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+add rb21, r2, r1  # VPM for 16bit intermediates
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1  # DMA out
- 
- # submit texture requests for second line
- max r1, ra_y, 0
-@@ -187,7 +179,7 @@ add ra_y, ra_y, 1
- bra -, ra31
- nop ; mul24 r1, r1, rb_pitch
- add t0s, r1, ra_x
--add t0s, r1, ra_frame_base
-+add t1s, r1, ra_frame_base
- 
- 
- 
-@@ -248,17 +240,15 @@ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- mov r0, unif # U offset/weight
- asr rb15, r0, r2  # Compute offset from MSBs
- shl r0, r0, r2
--asr rb14, r0, r2  # Compute weight from LSBs
-+asr r3, r0, r2  # Compute weight from LSBs
- mov r0, unif # V offset/weight
- asr.ifnz rb15, r0, r2
- shl r0, r0, r2
--asr.ifnz rb14, r0, r2
-+asr.ifnz r3, r0, r2
-+shl rb14,r3,8 # Scale up weights so we can use mul24 in signed fashion
- 
- # r2 is elem_num
- # r3 is loop counter
--
--mov r5rep, -8
--
- # retrieve texture results and pick out bytes
- # then submit two more texture requests
- 
-@@ -269,7 +259,7 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-@@ -278,7 +268,7 @@ max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--add t0s, ra_frame_base, r2
-+add t1s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -301,11 +291,6 @@ mov ra13, ra14       # Delay slot 1
- mov ra14, ra15       # Delay slot 2
- mov ra15, r0         # Delay slot 3
- 
--mov rb12,32 # TODO remove these to make P weighted prediction work properly
--mov rb13,6
--mov rb14,1
--mov rb15,0
--
- # apply vertical filter and write to VPM
- 
- nop                     ; mul24 r1, ra14, rb10
-@@ -412,7 +397,7 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-@@ -421,7 +406,7 @@ max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--add t0s, ra_frame_base, r2
-+add t1s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -542,7 +527,7 @@ mov r3, 0
- # then submit two more texture requests
- 
- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-@@ -551,7 +536,7 @@ max r2, ra_y, 0  # y
- min r2, r2, rb_frame_height_minus_1
- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
--add t0s, ra_frame_base, r2
-+add t1s, ra_frame_base, r2
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
-@@ -617,9 +602,9 @@ mov  -, vw_wait # wait on the VDW
- mov -,srel(0)
- 
- ldtmu0
-+ldtmu1
- ldtmu0
--ldtmu0
--ldtmu0
-+ldtmu1
- 
- nop        ; nop ; thrend
- nop        ; nop # delay slot 1
-@@ -630,9 +615,9 @@ nop        ; nop # delay slot 2
- mov  -, vw_wait # wait on the VDW
- 
- ldtmu0
-+ldtmu1
- ldtmu0
--ldtmu0
--ldtmu0
-+ldtmu1
- 
- mov -,sacq(0) # 1
- mov -,sacq(0) # 2
-@@ -656,200 +641,249 @@ nop        ; nop # delay slot 2
- # For P frames we make the second x,y coordinates offset by +8
- 
- ################################################################################
--# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
- ::mc_setup
-+  mov r3, 16
- 
--# Read starting kernel
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--mov ra31, unif
--
--# Compute base address for first and second access
--add r0, unif, elem_num # Load x
--max r0, r0, 0; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl ra_xshift_next, r0, 3 # Compute shifts
--add ra_y, r1, 1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add r2, r2, r0  # r2 is address for frame0 (not including y offset)
--max r1, r1, 0
--min r1, r1, rb_frame_height_minus_1
--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
--add t0s, r2, r1 ; mov ra_frame_base, r2
--
--add r0, unif, elem_num # Load x
--max r0, r0, 0; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl rx_xshift2_next, r0, 3 # Compute shifts
--add ra_y2, r1, 1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add r2, r2, r0  # r2 is address for frame1 (not including y offset)
--max r1, r1, 0
--min r1, r1, rb_frame_height_minus_1
--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
--add t0s, r2, r1 ; mov ra_frame_base2, r2
--
-+  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+  mov ra8, unif
-+  mov ra9, unif
-+  mov ra10, unif
-+  mov ra11, unif
- 
- # Read image dimensions
--sub rb25,unif,1
--sub rb30,unif,1
-+  mov r1, unif # width_height
-+  shl r0,r1,r3
-+  asr r1,r1,r3 # width
-+  asr r0,r0,r3 # height
-+  sub rb_frame_width_minus_1,r1,1
-+  sub rb_frame_height_minus_1,r0,1
- 
- # get source pitch
--mov rb16, unif
-+  mov rb_pitch, unif
- 
- # get destination pitch
--mov r0, unif
--mov r1, vdw_setup_1(0)
--add rb24, r1, r0
-+  mov r0, unif
-+  mov r1, vdw_setup_1(0)
-+  add rb24, r1, r0
- 
--# load constants
--
--mov ra20, 1
--mov ra22, 256
--mov ra30, 64
--
--mov rb20, 0xffffff00
--mov rb22, 255
--mov rb23, 24
-+# Compute base address for first and second access
-+  mov r1, ra8 # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
-+  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
-+  shl ra_xshift_next, r0, 3 # Compute shifts
-+  add ra_y, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-+  max r1, r1, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+  add t0s, r2, r1 ; mov ra_frame_base, r2
-+
-+  mov r1, ra10 # y2_x2
-+  shl r0,r1,r3 # r0 is x2<<16
-+  asr r1,r1,r3 # r1 is y2
-+  asr r0,r0,r3 # r0 is x2
-+  add r0, r0, elem_num # Add the per-element offset to x2
-+  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
-+  shl rx_xshift2_next, r0, 3 # Compute shifts
-+  add ra_y2, r1, 1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-+  max r1, r1, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame1
-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
- 
--# touch vertical context to keep simulator happy
- 
--mov ra8, 0
--mov ra9, 0
--mov ra10, 0
--mov ra11, 0
--mov ra12, 0
--mov ra13, 0
--mov ra14, 0
--mov ra15, 0
-+# load constants
- 
--# Compute part of VPM to use for DMA output
--mov r2, qpu_num
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
--shl r0, r0, 5
--add rb27, r0, r1
-+  mov ra20, 1
-+  mov ra22, 256
-+  mov ra30, 64
- 
--# Compute part of VPM to save data into
--mov r2, qpu_num   # qpu_num = abcd
--mov r1, r2
--asr r1, r1, 2
--shl r1, r1, 6
--mov r0, r2
--and r0, r0, 3
--add r0, r0, r1
--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
--add rb28, r0, r1
-+  mov rb20, 0xffffff00
-+  mov rb22, 255
-+  mov rb23, 24
- 
--mov rb12,unif # offset before shift
--mov rb13,unif # shift
-+# touch vertical context to keep simulator happy
- 
--# Dump padding words
--mov r0, unif
-+  mov ra8, 0
-+  mov ra9, 0
-+  mov ra10, 0
-+  mov ra11, 0
-+  mov ra12, 0
-+  mov ra13, 0
-+  mov ra14, 0
-+  mov ra15, 0
-+  mov ra18, 0x4000
-+
-+# Compute part of VPM to use
-+  mov r2, qpu_num
-+  mov r1, r2
-+  asr r1, r1, 2
-+  shl r1, r1, 6
-+  mov r0, r2
-+  and r0, r0, 3
-+  add r0, r0, r1
-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+  add rb28, r0, r1  # VPM for saving data
-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+  shl r0, r0, 5
-+  add rb27, r0, r1  # Command for dma output
-+
-+# Weighted prediction denom
-+
-+  mov r1, unif # offset_shift
-+  shl r0,r1,r3 ; mov r2,8
-+  asr rb13,r0,r3 # shift
-+  asr rb12,r1,r3 # offset
-+  add rb13,rb13,r2    # mul24 is unsigned so scale up into high bits
-+  shl rb12, rb12, r2 # Account for larger shift
- 
- # submit texture requests for second line
--max r1, ra_y, 0
--min r1, r1, rb_frame_height_minus_1
--add ra_y, ra_y, 1
--nop ; mul24 r1, r1, rb_pitch
--add t0s, r1, ra_frame_base
--
--max r1, ra_y2, 0
--min r1, r1, rb_frame_height_minus_1
--bra -, ra31
--add ra_y2, ra_y2, 1           # Delay 1
--nop ; mul24 r1, r1, rb_pitch  # Delay 2
--add t0s, r1, ra_frame_base2   # Delay 3
--
--
--################################################################################
--
--# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
--# In a P block, only the first half of coefficients contain used information.
--# At this point we have already issued two pairs of texture requests for the current block
--# ra_x, ra_x16_base point to the current coordinates for this block
--::mc_filter
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--mov ra31, unif
-+  max r1, ra_y, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  add ra_y, ra_y, 1
-+  nop ; mul24 r1, r1, rb_pitch
-+  add t0s, r1, ra_frame_base
-+
-+  max r1, ra_y2, 0
-+  min r1, r1, rb_frame_height_minus_1
-+  add ra_y2, ra_y2, 1
-+  nop ; mul24 r1, r1, rb_pitch
-+  add t1s, r1, ra_frame_base2
-+
-+# FALL THROUGH TO PER-BLOCK SETUP
-+
-+# Start of per-block setup code
-+# P and B blocks share the same setup code to save on Icache space
-+:per_block_setup
-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+  mov ra31, unif
- 
- # per-channel shifts were calculated on the *previous* invocation
--
--mov ra_xshift, ra_xshift_next
--mov rx_xshift2, rx_xshift2_next
-+  mov ra_xshift, ra_xshift_next
-+  mov rx_xshift2, rx_xshift2_next
- 
- # get base addresses and per-channel shifts for *next* invocation
--add r0, unif, elem_num # Load x
--max r0, r0, 0; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl ra_xshift_next, r0, 3 # Compute shifts
--mov ra_y_next, r1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
--
--add r0, unif, elem_num # Load x
--max r0, r0, 0   ; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl rx_xshift2_next, r0, 3 # Compute shifts
--add ra_y2_next, r1, 1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
--
-+  mov r3, 16
-+  mov r1, unif # y_x
-+  shl r0,r1,r3 # r0 is x<<16
-+  asr r1,r1,r3 # r1 is y
-+  asr r0,r0,r3 # r0 is x
-+  add r0, r0, elem_num # Load x
-+  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+  shl ra_xshift_next, r0, 3 # Compute shifts
-+  mov ra_y_next, r1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add ra_frame_base_next, r2, r0 ; mov r1, unif # y2_x2
-+
-+  shl r0,r1,r3 # r0 is x2<<16
-+  asr r1,r1,r3 # r1 is y2
-+  asr r0,r0,r3 # r0 is x2
-+  add r0, r0, elem_num # Load x
-+  max r0, r0, 0
-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+  shl rx_xshift2_next, r0, 3 # Compute shifts
-+  mov ra_y2_next, r1
-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+  add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
- 
- # set up VPM write
--mov vw_setup, rb28
-+  mov vw_setup, rb28
- 
- # get width,height of block
--mov r2, 16
--mov r0, unif
--shr r1, r0, r2 # Extract width
--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
--and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
--shl r0, r0, 7
--add r0, r0, r1 # Combine width and height of destination area
--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
--add rb26, r0, rb27
-+  mov r0, unif
-+  shr r1, r0, r3 # Extract width
-+  sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+  and r0, r0, rb22 # Extract height
-+  add rb17, r0, 5
-+  add rb18, r0, 7
-+  shl r0, r0, 7
-+  add r0, r0, r1 # Combine width and height of destination area
-+  shl r0, r0, r3 # Shift into bits 16 upwards of the vdw_setup0 register
-+  add rb26, r0, rb27
- 
- # get filter coefficients and discard unused B frame values
--mov r0, unif
--mov.ifnz -, unif # Alternate coefficients are unused for P frames
--asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
--asr ra2, r0, rb23;      mul24 r0, r0, ra22
--asr ra1, r0, rb23;      mul24 r0, r0, ra22
--asr ra0, r0, rb23;      mov r0, unif
--mov.ifnz -, unif
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--asr ra4, r0, rb23;      mov r0, unif
--mov.ifnz -, unif
--asr rb11, r0, rb23;     mul24 r0, r0, ra22
--asr rb10, r0, rb23;     mul24 r0, r0, ra22
--asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
--mov.ifnz -, unif
--asr rb7, r0, rb23;      mul24 r0, r0, ra22
--asr rb6, r0, rb23;      mul24 r0, r0, ra22
--asr rb5, r0, rb23;      mul24 r0, r0, ra22
--asr rb4, r0, rb23
--
--mov r0, unif # Frame0 offset/weight
--mov.ifnz -, unif # Frame1 offset/weight unused
--asr rb15, r0, r2  # Compute offset from MSBs
--shl r0, r0, r2
--asr rb14, r0, r2  # Compute weight from LSBs
--
--# r3 is loop counter
-+  mov r0, unif ; mov r1,1  # Packed filter offsets, unpack into ra8... (to be used for vertical context later)
-+  asr ra9, r0, rb23;      mul24 r0, r0, ra22 # my2
-+  asr ra8, r0, rb23;      mul24 r0, r0, ra22 # mx2
-+  asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22 # my:my2
-+  asr.ifz ra8, r0, rb23                      # mx:mx2
-+  sub ra9,3,ra9
-+  sub ra8,3,ra8
-+  shl ra9,ra9,3   # Scale up by 8
-+  shl ra8,ra8,3   # Scale up by 8
-+# Now if we want aligned we have a mul of 1, so put 0 coefficients at the top
-+  mov r1,0xffff00
-+  shl r0, r1, ra8
-+  asr ra0, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb4, r0, rb23
-+
-+  mov r1,0x1040400
-+  shl r0, r1, ra8
-+  asr ra1, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb5, r0, rb23
-+
-+  mov r1,0xfbf5f600
-+  shl r0, r1, ra8
-+  asr ra2, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb6, r0, rb23
-+
-+  mov r1,0x11283a40
-+  shl r0, r1, ra8
-+  asr ra3, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb7, r0, rb23
-+
-+  mov r1,0x3a281100
-+  shl r0, r1, ra8
-+  asr ra4, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb8, r0, rb23
-+
-+  mov r1,0xf6f5fb00
-+  shl r0, r1, ra8
-+  asr ra5, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb9, r0, rb23
-+
-+  mov r1,0x4040100
-+  shl r0, r1, ra8
-+  asr ra6, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb10, r0, rb23
-+
-+  mov r1,0xffff0000
-+  shl r0, r1, ra8
-+  asr ra7, r0, rb23
-+  shl r0, r1, ra9
-+  asr rb11, r0, rb23
-+
-+# Extract weighted prediction information
-+  mov r0, unif      # offset/weight  TODO move up
-+  asr rb15, r0, r3  # Compute offset from MSBs
-+  bra -, ra31
-+  shl r0, r0, r3    #                                                            Delay 1
-+  asr r0, r0, r3 ; mov r3, 0 # Compute weight from LSBs and reset loop counter   Delay 2
-+  shl rb14, r0, 8 # Use a larger shift to avoid unsigned multiply problem        Delay 3
- 
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
-+################################################################################
-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+# In a P block, y2_x2 should be y_x+8
-+# At this point we have already issued two pairs of texture requests for the current block
- 
--mov r3, 0
-+::mc_filter
- 
- :yloop
- # retrieve texture results and pick out bytes
-@@ -858,91 +892,90 @@ mov r3, 0
- # If we knew there was no clipping then this code would get simpler.
- # Perhaps we could add on the pitch and clip using larger values?
- 
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, rx_xshift2
--mov.ifz ra_y2, ra_y2_next
-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+  shr r1, r4, rx_xshift2
-+  mov.ifz ra_y2, ra_y2_next
- 
--max r2, ra_y, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
--
--max r2, ra_y2, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+  max r2, ra_y, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
- 
-+  max r2, ra_y2, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
- 
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
- # apply horizontal filter
--nop                  ; mul24 r2, r0, ra0
--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--nop                  ; mul24      r3, ra1 << 1, r0 << 1
--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--add r0, r2, r3       ; mov r3, rb31
--sub.setf -, r3, 8    ; mov ra12, ra13
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
--mov ra12, ra13
--brr.anyn -, r:yloop
--mov ra13, ra14       # Delay slot 1
--mov ra14, ra15       # Delay slot 2
--mov ra15, r0         # Delay slot 3
-+  nop                  ; mul24 r2, r0, ra0
-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+  add r0, r2, r3       ; mov r3, rb31
-+  sub.setf -, r3, 8    ; mov ra8, ra9
-+  mov ra9, ra10
-+  mov ra10, ra11
-+  mov ra11, ra12
-+  mov ra12, ra13
-+  brr.anyn -, r:yloop
-+  mov ra13, ra14       # Delay slot 1
-+  mov ra14, ra15       # Delay slot 2
-+  mov ra15, r0         # Delay slot 3
- 
- # apply vertical filter and write to VPM
- 
--nop                     ; mul24 r1, ra14, rb10
--nop                     ; mul24 r0, ra13, rb9
--add r1, r1, r0          ; mul24 r0, ra12, rb8
--add r1, r1, r0          ; mul24 r0, ra15, rb11
--add r1, r1, r0          ; mul24 r0, ra8, rb4
--add r1, r1, r0          ; mul24 r0, ra9, rb5
--add r1, r1, r0          ; mul24 r0, ra10, rb6
--add r1, r1, r0          ; mul24 r0, ra11, rb7
--
--add r1, r1, r0          ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 14
--nop                     ; mul24 r1, r1, rb14
--add r1, r1, rb12
--asr r1, r1, rb13
--brr.anyn -, r:yloop
--add r1, r1, rb15       # Delay 1
--min r1, r1, rb22       # Delay 2
--max vpm, r1, 0         # Delay 3
-+  nop                     ; mul24 r1, ra14, rb10
-+  nop                     ; mul24 r0, ra13, rb9
-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
-+
-+  add r1, r1, r0          ; mov -, vw_wait
-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+  asr r1, r1, 14
-+  nop                     ; mul24 r1, r1, rb14
-+  add r1, r1, rb12
-+  asr r1, r1, rb13
-+  brr.anyn -, r:yloop
-+  add r1, r1, rb15       # Delay 1
-+  min r1, r1, rb22       # Delay 2
-+  max vpm, r1, 0         # Delay 3
- 
- # DMA out
- 
--bra -, ra31
--mov vw_setup, rb26 # VDW setup 0    Delay 1
--mov vw_setup, rb29 # Stride         Delay 2
--mov vw_addr, unif # start the VDW   Delay 3
-+  brr -, r:per_block_setup
-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
-+  mov vw_setup, rb29 # Stride         Delay 2
-+  mov vw_addr, unif # start the VDW   Delay 3
- 
- 
- 
- ################################################################################
- 
--# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
- # In a P block, only the first half of coefficients contain used information.
- # At this point we have already issued two pairs of texture requests for the current block
- # May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-@@ -952,92 +985,6 @@ mov vw_addr, unif # start the VDW   Delay 3
- # Or possibly by taking advantage of symmetry?
- # From 19->7 32bits per command.
- ::mc_filter_b
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
--mov ra31, unif
--
--# per-channel shifts were calculated on the *previous* invocation
--
--mov ra_xshift, ra_xshift_next
--mov rx_xshift2, rx_xshift2_next
--
--# get base addresses and per-channel shifts for *next* invocation
--add r0, unif, elem_num # Load x
--max r0, r0, 0; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl ra_xshift_next, r0, 3 # Compute shifts
--mov ra_y_next, r1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
--
--add r0, unif, elem_num # Load x
--max r0, r0, 0   ; mov r1, unif # Load y
--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
--shl rx_xshift2_next, r0, 3 # Compute shifts
--add ra_y2_next, r1, 1
--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
--
--
--# set up VPM write
--mov vw_setup, rb28
--
--# get width,height of block
--mov r2, 16
--mov r0, unif
--shr r1, r0, r2 # Extract width
--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
--and r0, r0, rb22 # Extract height
--add rb17, r0, 5
--add rb18, r0, 7
--shl r0, r0, 7
--add r0, r0, r1 # Combine width and height of destination area
--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
--add rb26, r0, rb27
--
--# get filter coefficients and discard unused B frame values
--mov r0, unif
--mov r1, 1
--mov.ifnz r0, unif # Alternate coefficients are unused for P frames
--nop              ;      mul24 r0, r0 << 13, r1 << 13
--asr ra3, r0, rb23;      mul24 r0, r0, ra22
--nop              ;      mul24 r0, r0 << 14, r1 << 14
--asr ra2, r0, rb23;      mul24 r0, r0, ra22
--nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
--asr ra1, r0, rb23;      mul24 r0, r0, ra22
--asr ra0, r0, rb23;      mov r0, unif
--mov.ifnz r0, unif
--nop              ;      mul24 r0, r0 << 9, r1 << 9
--asr ra7, r0, rb23;      mul24 r0, r0, ra22
--nop              ;      mul24 r0, r0 << 10, r1 << 10
--asr ra6, r0, rb23;      mul24 r0, r0, ra22
--nop              ;      mul24 r0, r0 << 11, r1 << 11
--asr ra5, r0, rb23;      mul24 r0, r0, ra22
--nop              ;      mul24 r0, r0 << 12, r1 << 12
--asr ra4, r0, rb23;      mov r0, unif
--mov.ifnz r0, unif
--asr rb11, r0, rb23;     mul24 r0, r0, ra22
--asr rb10, r0, rb23;     mul24 r0, r0, ra22
--asr rb9, r0, rb23;      mul24 r0, r0, ra22
--asr rb8, r0, rb23;      mov r0, unif
--mov.ifnz r0, unif
--asr rb7, r0, rb23;      mul24 r0, r0, ra22
--asr rb6, r0, rb23;      mul24 r0, r0, ra22
--asr rb5, r0, rb23;      mul24 r0, r0, ra22
--asr rb4, r0, rb23
--
--mov r0, unif # Frame0 offset/weight
--mov.ifnz r0, unif # Frame1 offset/weight unused
--asr rb15, r0, r2  # Compute offset from MSBs
--shl r0, r0, r2
--asr rb14, r0, r2  # Compute weight from LSBs
--
--# r3 is loop counter
--
--# retrieve texture results and pick out bytes
--# then submit two more texture requests
--
--mov r3, 0
--
- :yloopb
- # retrieve texture results and pick out bytes
- # then submit two more texture requests
-@@ -1045,111 +992,123 @@ mov r3, 0
- # If we knew there was no clipping then this code would get simpler.
- # Perhaps we could add on the pitch and clip using larger values?
- 
--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
--shr r1, r4, rx_xshift2
--mov.ifz ra_y2, ra_y2_next
-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+  shr r1, r4, rx_xshift2
-+  mov.ifz ra_y2, ra_y2_next
- 
--max r2, ra_y, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
--
--max r2, ra_y2, 0  # y
--min r2, r2, rb_frame_height_minus_1
--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+  max r2, ra_y, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
- 
-+  max r2, ra_y2, 0  # y
-+  min r2, r2, rb_frame_height_minus_1
-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
- 
- # generate seven shifted versions
- # interleave with scroll of vertical context
- 
--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
- 
- # apply horizontal filter
--nop                  ; mul24 r2, r0, ra0
--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
--nop                  ; mul24      r3, ra1 << 1, r0 << 1
--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
--add r0, r2, r3       ; mov r3, rb31
--sub.setf -, r3, 8    ; mov ra12, ra13
--mov ra9, ra10
--mov ra10, ra11
--mov ra11, ra12
--mov ra12, ra13
--brr.anyn -, r:yloopb
--mov ra13, ra14       # Delay slot 1
--mov ra14, ra15       # Delay slot 2
--mov ra15, r0         # Delay slot 3
--
--# apply vertical filter and write to VPM
--
--nop                     ; mul24 r1, ra14, rb10
--nop                     ; mul24 r0, ra13, rb9
--add r1, r1, r0          ; mul24 r0, ra12, rb8
--add r1, r1, r0          ; mul24 r0, ra15, rb11
--add r1, r1, r0          ; mul24 r0, ra8, rb4
--add r1, r1, r0          ; mul24 r0, ra9, rb5
--add r1, r1, r0          ; mul24 r0, ra10, rb6
--add r1, r1, r0          ; mul24 r0, ra11, rb7
--
--add r1, r1, r0          ; mov -, vw_wait
--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
--asr r1, r1, 14
--nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
--add r1, r1, ra30        ; mul24 r0, r1, rb14
--add r1, r1, r0
--brr.anyn -, r:yloopb
--asr r1, r1, 7          # Delay 1
--min r1, r1, rb22       # Delay 2
--max vpm, r1, 0         # Delay 3
-+  nop                  ; mul24 r2, r0, ra0
-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+  add r0, r2, r3       ; mov r3, rb31
-+  sub.setf -, r3, 8    ; mov ra8, ra9
-+  mov ra9, ra10
-+  mov ra10, ra11
-+  mov ra11, ra12
-+  mov ra12, ra13
-+  brr.anyn -, r:yloopb
-+  mov ra13, ra14       # Delay slot 1
-+  mov ra14, ra15       # Delay slot 2
-+  mov ra15, r0         # Delay slot 3
-+
-+  # apply vertical filter and write to VPM
-+
-+  nop                     ; mul24 r1, ra14, rb10
-+  nop                     ; mul24 r0, ra13, rb9
-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
-+
-+  add r1, r1, r0          ; mov -, vw_wait
-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+  asr r0, r1, 14
-+  asr r1, r1, 6           # Wait state so we can use the rotate instruction
-+  nop                     ; mul24 r0, r0 << 8, ra22 << 8 # Rotate to align left and right halves
-+  add r1, r1, ra18
-+  add r1, r1, r0
-+  brr.anyn -, r:yloopb
-+  asr r1, r1, 15         # Delay 1
-+  min r1, r1, rb22       # Delay 2
-+  max vpm, r1, 0         # Delay 3
- 
- # DMA out
--bra -, ra31
--mov vw_setup, rb26 # VDW setup 0    Delay 1
--mov vw_setup, rb29 # Stride         Delay 2
--mov vw_addr, unif # start the VDW   Delay 3
-+  brr -, r:per_block_setup
-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
-+  mov vw_setup, rb29 # Stride         Delay 2
-+  mov vw_addr, unif # start the VDW   Delay 3
- 
- ################################################################################
- 
- # mc_interrupt_exit12()
- ::mc_interrupt_exit12
--mov  -, vw_wait # wait on the VDW
--
--ldtmu0
--ldtmu0
--ldtmu0
--ldtmu0
--
--mov -,sacq(0) # 1
--mov -,sacq(0) # 2
--mov -,sacq(0) # 3
--mov -,sacq(0) # 4
--mov -,sacq(0) # 5
--mov -,sacq(0) # 6
--mov -,sacq(0) # 7
--mov -,sacq(0) # 8
--mov -,sacq(0) # 9
--mov -,sacq(0) # 10
--mov -,sacq(0) # 11
--
--nop        ; nop ; thrend
--mov interrupt, 1; nop # delay slot 1
--nop        ; nop # delay slot 2
-+  mov  -, vw_wait # wait on the VDW
-+
-+  ldtmu0
-+  ldtmu0
-+  ldtmu1
-+  ldtmu1
-+
-+  mov -,sacq(0) # 1
-+  mov -,sacq(0) # 2
-+  mov -,sacq(0) # 3
-+  mov -,sacq(0) # 4
-+  mov -,sacq(0) # 5
-+  mov -,sacq(0) # 6
-+  mov -,sacq(0) # 7
-+  mov -,sacq(0) # 8
-+  mov -,sacq(0) # 9
-+  mov -,sacq(0) # 10
-+  mov -,sacq(0) # 11
-+
-+  nop        ; nop ; thrend
-+  mov interrupt, 1; nop # delay slot 1
-+  nop        ; nop # delay slot 2
-+
-+
-+::mc_exit1
-+  mov  -, vw_wait # wait on the VDW
-+
-+  ldtmu0
-+  ldtmu1
-+  ldtmu0
-+  ldtmu1
-+  nop        ; nop ; thrend
-+  mov interrupt, 1; nop # delay slot 1
-+  nop        ; nop # delay slot 2
- 
- 
- ::mc_end
--- 
-2.7.4
-
-
-From f02ec34c772aad3caa17432c6a4860f9ed0d5dc6 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 2 Jun 2015 10:58:25 +0100
-Subject: [PATCH 48/68] Added option to simulate QPUs
-
----
- libavcodec/hevc.c          | 288 +++++++++++++++++++++++++++++++++++++++++++--
- libavcodec/rpi_qpu.c       |  24 ++--
- libavcodec/rpi_shader.qasm |   6 +-
- 3 files changed, 295 insertions(+), 23 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 2da88ec..34d92e2 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -56,6 +56,8 @@
-   // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-   // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
- 
-+  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-+  //#define RPI_SIMULATE_QPUS
- 
- #endif
- 
-@@ -124,7 +126,6 @@ static void pic_arrays_free(HEVCContext *s)
- 
- #ifdef EARLY_MALLOC
- #else
--    printf("pic_arrays_free\n");
-     if (s->coeffs_buf_arm[0]) {
-       gpu_free(&s->coeffs_buf_default);
-       s->coeffs_buf_arm[0] = 0;
-@@ -174,11 +175,9 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
- #ifdef RPI
- #ifdef EARLY_MALLOC
- #else
--    assert(sps);
-+    av_assert0(sps);
-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
--    printf("pic_arrays_init\n");
--    printf("Allocated %d\n",coefs_per_row);
-     gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-     s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-     if (!s->coeffs_buf_arm[0])
-@@ -2988,6 +2987,274 @@ static void rpi_inter_clear(HEVCContext *s)
- #endif
- }
- 
 +
 +#ifdef RPI_SIMULATE_QPUS
 +
@@ -28438,12 +3698,13 @@ index 2da88ec..34d92e2 100644
 +   return vsum;
 +}
 +
-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
 +{
 +  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
 +  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
 +  int pitch = frame->linesize[cIdx];
-+  uint32_t base = get_vc_address(frame->buf[cIdx]);
++  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
++    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
 +  if (p>=base && p<base+pitch*pic_height) {
 +    return frame->data[cIdx] + (p-base);
 +  }
@@ -28541,14 +3802,15 @@ index 2da88ec..34d92e2 100644
 +}
 +
 +// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
 +{
 +  uint32_t next_kernel;
 +  int y_x,y2_x2;
-+  uint32_t x0;
-+  uint32_t y0;
-+  uint32_t x2;
-+  uint32_t y2;
++  int x0;
++  int y0;
++  int x2;
++  int y2;
++  uint32_t *p0 = p;
 +  uint8_t *ref_y_base;
 +  uint8_t *ref_y2_base;
 +  uint32_t frame_width_height = p[4];
@@ -28578,13 +3840,15 @@ index 2da88ec..34d92e2 100644
 +      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
 +      uint32_t width = width_height >> 16;
 +      uint32_t height = (width_height << 16) >> 16;
++      uint8_t *dst_base = s->frame->data[0];
 +      ref_y_base = compute_arm_addr(s,p[1-9],0);
 +      ref_y2_base = compute_arm_addr(s,p[3-9],0);
 +      for (y=0; y<height; ++y) {
 +        for (x=0; x<width; ++x) {
 +          if (next_kernel==s->mc_filter) {
 +            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
++            refa = av_clip_uint8(refa);
++            this_dst[x+y*dst_pitch] = refa;
 +          }
 +          else {
 +            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
@@ -28611,1204 +3875,99 @@ index 2da88ec..34d92e2 100644
 +  }
 +  for(i=0;i<12;i++)
 +  {
-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
-+  }
-+}
-+
-+#endif
-+
-+
- static void rpi_execute_inter_qpu(HEVCContext *s)
- {
-     int k;
-@@ -3006,7 +3273,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
--        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-     }
- 
-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-@@ -3016,11 +3283,16 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
--        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-+        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-     }
-     s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
- #endif
- 
-+#ifdef RPI_SIMULATE_QPUS
-+    rpi_simulate_inter_qpu(s);
-+    s->vpu_id = -1;
-+    return;
-+#endif
- 
- #ifdef RPI_MULTI_MAILBOX
- #ifdef RPI_CACHE_UNIF_MVS
-@@ -3101,7 +3373,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
- #endif
- 
--    /*if (!s->enable_rpi) {
-+    if (!s->enable_rpi) {
-       if (s->ps.pps->cross_component_prediction_enabled_flag)
-         printf("Cross component\n");
-       if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-@@ -3110,7 +3382,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         printf("Weighted P slice\n");
-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-         printf("Weighted B slice\n");
--    }*/
-+    }
- 
- #endif
- 
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index e12304b..4480f72 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -13,7 +13,7 @@
- #include <stdlib.h>
- #include <string.h>
- #include <stddef.h>
--#include <assert.h>
-+#include "libavutil/avassert.h"
- 
- #include "config.h"
- 
-@@ -160,13 +160,13 @@ static int gpu_init(volatile struct GPU **gpu) {
-   // Now copy over the QPU code into GPU memory
-   {
-     int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
--    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-   }
-   // And the VPU code
-   {
-     int num_bytes = sizeof(rpi_hevc_transform);
--    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-   }
-   // And the transform coefficients
-@@ -216,13 +216,13 @@ static void gpu_unlock(void) {
- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-   p->numbytes = numbytes;
-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
--  assert(p->vcsm_handle);
-+  av_assert0(p->vcsm_handle);
-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
--  assert(p->vc_handle);
-+  av_assert0(p->vc_handle);
-   p->arm = vcsm_lock(p->vcsm_handle);
--  assert(p->arm);
-+  av_assert0(p->arm);
-   p->vc = mem_lock(mb, p->vc_handle);
--  assert(p->vc);
-+  av_assert0(p->vc);
-   return 0;
- }
- 
-@@ -243,7 +243,7 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
- 
- int gpu_get_mailbox(void)
- {
--  assert(gpu);
-+  av_assert0(gpu);
-   return gpu->mb;
- }
- 
-@@ -297,13 +297,13 @@ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
--  assert(p->vcsm_handle);
-+  av_assert0(p->vcsm_handle);
-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
--  assert(p->vc_handle);
-+  av_assert0(p->vc_handle);
-   p->arm = vcsm_lock(p->vcsm_handle);
--  assert(p->arm);
-+  av_assert0(p->arm);
-   p->vc = mem_lock(gpu->mb, p->vc_handle);
--  assert(p->vc);
-+  av_assert0(p->vc);
-   return 0;
- }
- 
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 60d1ec2..0686249 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -149,8 +149,8 @@ add t0s, r0, r1 ; mov ra_frame_base, r2
- add t1s, r2, r1
- 
- mov r2,8
--shl rb12,unif, r2 # offset before shift
--add rb13,unif,r2  # offset after shift
-+shl rb12,unif,r2 # offset before shift
-+add rb13,unif,r2  # denominator
- 
- # Compute part of VPM to use for DMA output
- mov r2, unif
-@@ -185,7 +185,7 @@ add t1s, r1, ra_frame_base
- 
- ################################################################################
- 
--# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
- 
- # At this point we have already issued two pairs of texture requests for the current block
- # ra_x, ra_x16_base point to the current coordinates for this block
--- 
-2.7.4
-
-
-From 8bdf6b06c612ff4971c2ce99a62d093cf92468ca Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 2 Jun 2015 13:17:50 +0100
-Subject: [PATCH 49/68] Increased motion vector memory and fixed block size
- computation for non-multiple of 2 block sizes
-
----
- libavcodec/hevc.c | 50 +++++++++++++++++++++++++++++++-------------------
- 1 file changed, 31 insertions(+), 19 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 34d92e2..3fb1e2a 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -83,11 +83,9 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- 
- // Split image of 2048 into parts 64 wide
- // So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
--// Each block of 64*64
--// Smallest CTU size is 16x16, so smallest block is 8x8
--// Corresponds to a total of 83kbytes over all 12 QPUs
-+// For each block of 64*64 the smallest block size is 8x4
- #define RPI_LUMA_COMMAND_WORDS 9
--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
- 
- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
-@@ -2042,11 +2040,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             uint32_t *y = s->y_mvs[chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   if (weight_flag) {
-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-@@ -2089,12 +2089,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-                       if (weight_flag) {
-@@ -2141,11 +2143,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             uint32_t *y = s->y_mvs[chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   if (weight_flag) {
-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-@@ -2189,12 +2193,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-@@ -2246,11 +2252,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             uint32_t *y = s->y_mvs[chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                  int bw = nPbW-start_x;
-+                  int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
--                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-+                  *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   *y++ = 1; // B frame weighted prediction not supported
-                   *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-@@ -2293,12 +2301,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 uint32_t *u = s->u_mvs[chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                      int bw = nPbW_c-start_x;
-+                      int bh = nPbH_c-start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-                       u+=2; // Weights not supported in B slices
-@@ -2309,7 +2319,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx2][0];
-                       *u++ = rpi_filter_coefs[_my2][0];
-                       u+=2; // Weights not supported in B slices
-@@ -3178,14 +3188,15 @@ static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
- }
- 
- // mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
--static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
- {
-   uint32_t next_kernel;
-   int y_x,y2_x2;
--  uint32_t x0;
--  uint32_t y0;
--  uint32_t x2;
--  uint32_t y2;
-+  int x0;
-+  int y0;
-+  int x2;
-+  int y2;
-+  uint32_t *p0 = p;
-   uint8_t *ref_y_base;
-   uint8_t *ref_y2_base;
-   uint32_t frame_width_height = p[4];
-@@ -3215,13 +3226,15 @@ static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-       uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-       uint32_t width = width_height >> 16;
-       uint32_t height = (width_height << 16) >> 16;
-+      uint8_t *dst_base = s->frame->data[0];
-       ref_y_base = compute_arm_addr(s,p[1-9],0);
-       ref_y2_base = compute_arm_addr(s,p[3-9],0);
-       for (y=0; y<height; ++y) {
-         for (x=0; x<width; ++x) {
-           if (next_kernel==s->mc_filter) {
-             int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
--            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+            refa = av_clip_uint8(refa);
-+            this_dst[x+y*dst_pitch] = refa;
-           }
-           else {
-             int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-@@ -3248,7 +3261,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-   }
-   for(i=0;i<12;i++)
-   {
--    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
 +    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-   }
- }
- 
-@@ -3290,7 +3303,6 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- 
- #ifdef RPI_SIMULATE_QPUS
-     rpi_simulate_inter_qpu(s);
--    s->vpu_id = -1;
-     return;
- #endif
- 
--- 
-2.7.4
-
-
-From da5ae7e96dd961ccc7bc162c8acf336d54a50092 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 2 Jun 2015 14:36:54 +0100
-Subject: [PATCH 50/68] Added support for skip deblock
-
----
- libavcodec/hevc.c        |  5 +++++
- libavcodec/hevc.h        |  2 ++
- libavcodec/hevc_filter.c | 14 ++++----------
- 3 files changed, 11 insertions(+), 10 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 3fb1e2a..0ac4f4c 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3397,6 +3397,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     }
- 
- #endif
-+    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-+                        s->nal_unit_type == NAL_TSA_N   ||
-+                        s->nal_unit_type == NAL_STSA_N  ||
-+                        s->nal_unit_type == NAL_RADL_N  ||
-+                        s->nal_unit_type == NAL_RASL_N);
- 
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 5df9dcd..5cb90b5 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -890,6 +890,8 @@ typedef struct HEVCContext {
-     int                 width;
-     int                 height;
- 
-+    int used_for_ref;
-+
- #ifdef RPI
-     int enable_rpi;
-     HEVCMvCmd *unif_mv_cmds;
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 11629e4..14a0952 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -512,16 +512,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-                s->ps.pps->transquant_bypass_enable_flag;
- 
- #ifdef DISABLE_DEBLOCK_NONREF
--    if (    s->nal_unit_type == NAL_TRAIL_N ||
--            s->nal_unit_type == NAL_TSA_N   ||
--            s->nal_unit_type == NAL_STSA_N  ||
--            s->nal_unit_type == NAL_RADL_N  ||
--            s->nal_unit_type == NAL_RASL_N )
-+    if (!s->used_for_ref)
-       return; // Don't deblock non-reference frames
- #endif
- #ifdef DISABLE_DEBLOCK
-     return;
- #endif
-+    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
-+        return;
- 
-     if (x0) {
-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-@@ -885,11 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
- 
- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- {
--    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
--            s->nal_unit_type == NAL_TSA_N   ||
--            s->nal_unit_type == NAL_STSA_N  ||
--            s->nal_unit_type == NAL_RADL_N  ||
--            s->nal_unit_type == NAL_RASL_N )) {
-+    if (s->enable_rpi && s->used_for_ref) {
- #ifdef RPI_FAST_CACHEFLUSH
-         struct vcsm_user_clean_invalid_s iocache = {};
-         int curr_y = ((int *)f->progress->data)[0];
--- 
-2.7.4
-
-
-From 6401d88c310cd3bfec7be94bf3ceb6d0c5736c7e Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Tue, 2 Jun 2015 15:22:52 +0100
-Subject: [PATCH 51/68] Added support for skip_frame
-
----
- libavcodec/hevc.c | 15 ++++++++++-----
- 1 file changed, 10 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 0ac4f4c..639e4df 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3397,11 +3397,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     }
- 
- #endif
--    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
--                        s->nal_unit_type == NAL_TSA_N   ||
--                        s->nal_unit_type == NAL_STSA_N  ||
--                        s->nal_unit_type == NAL_RADL_N  ||
--                        s->nal_unit_type == NAL_RASL_N);
- 
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-@@ -3925,6 +3920,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
-         if (ret < 0)
-             return ret;
- 
-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-+                        s->nal_unit_type == NAL_TSA_N   ||
-+                        s->nal_unit_type == NAL_STSA_N  ||
-+                        s->nal_unit_type == NAL_RADL_N  ||
-+                        s->nal_unit_type == NAL_RASL_N);
-+
-+        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
-+            s->is_decoded = 0;
-+            break;
-+        }
-         if (s->max_ra == INT_MAX) {
-             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
-                 s->max_ra = s->poc;
--- 
-2.7.4
-
-
-From d2951e2ca73e234d1b775621e3993948a4a2c8ea Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 3 Jun 2015 09:15:38 +0100
-Subject: [PATCH 52/68] Fixed cache flushing of luma when using old method
-
----
- libavcodec/hevc_filter.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 14a0952..b286bbf 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -919,7 +919,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-         flush_buffer(s->frame->buf[1]);
-         flush_buffer(s->frame->buf[2]);
- #ifdef RPI_LUMA_QPU
--        flush_buffer(s->frame->buf[1]);
-+        flush_buffer(s->frame->buf[0]);
- #endif
- 
- #endif
--- 
-2.7.4
-
-
-From 7ae612e69c1cabcc7d0b37b65efa8c5bdcfa7bf5 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 3 Jun 2015 11:37:27 +0100
-Subject: [PATCH 53/68] Option to parallelise coefficient decode and inter
- prediction and deblock for each frame
-
----
- libavcodec/hevc.c              | 701 +++++++++++++++++++++++++++--------------
- libavcodec/hevc.h              |  74 +++--
- libavcodec/hevc_cabac.c        |  12 +-
- libavcodec/hevcpred_template.c |   5 +-
- 4 files changed, 522 insertions(+), 270 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 639e4df..12aacc5 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -43,8 +43,6 @@
- 
- #ifdef RPI
-   #include "rpi_qpu.h"
--  // For some unknown reason, the code seems to crash if I do a late malloc
--  //#define EARLY_MALLOC
-   // Move Inter prediction into separate pass
-   #define RPI_INTER
- 
-@@ -58,6 +56,21 @@
- 
-   // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-   //#define RPI_SIMULATE_QPUS
-+  #ifdef RPI_WORKER
-+    #include "pthread.h"
-+  #endif
-+
-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
-+  static void rpi_execute_transform(HEVCContext *s);
-+  static void rpi_execute_inter_qpu(HEVCContext *s);
-+  static void rpi_execute_pred_cmds(HEVCContext *s);
-+  static void rpi_execute_inter_cmds(HEVCContext *s);
-+  static void rpi_inter_clear(HEVCContext *s);
-+
-+  // Define INTER_PASS0 to do inter prediction in first pass
-+  //#define INTER_PASS0
-+  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
-+  //#define LAUNCH_PASS0
- 
- #endif
- 
-@@ -105,6 +118,143 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-   return p->vc;
- }
-+#endif
-+
-+
-+#ifdef RPI_WORKER
-+
-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-+
-+#define LOG_ENTER
-+#define LOG_EXIT
-+
-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
-+static void worker_submit_job(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  //pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
-+  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-+  //pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call this to say we have completed pass1
-+static void worker_complete_middle_job(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  //pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
-+  //pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call this to say we have completed pass2
-+static void worker_complete_job(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  //pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_head++; // This is the only place that can change head so we do not need the mutex
-+  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
-+  //pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+// Call this to wait for all jobs to have completed at the end of a frame
-+static void worker_wait(HEVCContext *s)
-+{
-+  LOG_ENTER
-+  pthread_mutex_lock(&s->worker_mutex);
-+  while( s->worker_head !=s->worker_tail)
-+  {
-+    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
 +  }
-+  pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
 +}
 +
-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-+// available to receive the next job.
-+static void worker_pass0_ready(HEVCContext *s)
-+{
-+  LOG_ENTER
-+    pthread_mutex_lock(&s->worker_mutex);
-+    // tail is number of submitted jobs
-+    // head is number of completed jobs
-+    // tail-head is number of outstanding jobs in the queue
-+    // we need to ensure there is at least 1 space left for us to use
-+    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
-+    {
-+      // Wait until another job is completed
-+      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-+    }
-+    pthread_mutex_unlock(&s->worker_mutex);
-+  LOG_EXIT
-+}
-+
-+static void *worker_start(void *arg)
-+{
-+  HEVCContext *s = (HEVCContext *)arg;
-+  while(1) {
-+    pthread_mutex_lock(&s->worker_mutex);
-+
-+    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-+    {
-+      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-+    }
-+    pthread_mutex_unlock(&s->worker_mutex);
-+
-+    if (s->kill_worker) {
-+      break;
-+    }
-+    LOG_ENTER
-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+#ifndef LAUNCH_PASS0
-+    rpi_execute_inter_qpu(s);
 +#endif
-+#ifndef INTER_PASS0
-+    // Perform inter prediction
-+    rpi_execute_inter_cmds(s);
-+#endif
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
 +
-+    worker_complete_middle_job(s);
-+    LOG_EXIT
-+  }
-+  return NULL;
-+}
-+
-+static void *worker_deblock_start(void *arg)
-+{
-+  HEVCContext *s = (HEVCContext *)arg;
-+  while(1) {
-+    pthread_mutex_lock(&s->worker_mutex);
-+    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
-+    {
-+      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
-+    }
-+    pthread_mutex_unlock(&s->worker_mutex);
-+
-+    if (s->kill_worker) {
-+      break;
-+    }
-+    LOG_ENTER
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+
-+    worker_complete_job(s);
-+    LOG_EXIT
-+  }
-+  return NULL;
-+}
- 
- #endif
- 
-@@ -121,19 +271,18 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
- static void pic_arrays_free(HEVCContext *s)
- {
- #ifdef RPI
--
--#ifdef EARLY_MALLOC
--#else
--    if (s->coeffs_buf_arm[0]) {
--      gpu_free(&s->coeffs_buf_default);
--      s->coeffs_buf_arm[0] = 0;
--    }
--    if (s->coeffs_buf_arm[2]) {
--      gpu_free(&s->coeffs_buf_accelerated);
--      s->coeffs_buf_arm[2] = 0;
-+    int job;
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+      if (s->coeffs_buf_arm[job][0]) {
-+        gpu_free(&s->coeffs_buf_default[job]);
-+        s->coeffs_buf_arm[job][0] = 0;
-+      }
-+      if (s->coeffs_buf_arm[job][2]) {
-+        gpu_free(&s->coeffs_buf_accelerated[job]);
-+        s->coeffs_buf_arm[job][2] = 0;
-+      }
-     }
- #endif
--#endif
-     av_freep(&s->sao);
-     av_freep(&s->deblock);
- 
-@@ -171,24 +320,26 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
- 
- #ifdef RPI
--#ifdef EARLY_MALLOC
--#else
-     av_assert0(sps);
-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
--    if (!s->coeffs_buf_arm[0])
--        goto fail;
--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
--    if (!s->coeffs_buf_arm[2])
--        goto fail;
--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
--    printf("Done\n");
--#endif
-+    int job;
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+      printf("Allocated %d\n",coefs_per_row);
-+      for(job=0;job<RPI_MAX_JOBS;job++) {
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-+        if (!s->coeffs_buf_arm[job][0])
-+            goto fail;
-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-+        if (!s->coeffs_buf_arm[job][2])
-+            goto fail;
-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-+      }
-+    }
- #endif
- 
-     s->bs_width  = (width  >> 2) + 1;
-@@ -1036,7 +1187,7 @@ static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0,
- {
-     if (s->enable_rpi) {
-         HEVCLocalContext *lc = s->HEVClc;
--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-         cmd->type = RPI_PRED_INTRA;
-         cmd->size = log2_trafo_size;
-         cmd->c_idx = c_idx;
-@@ -1496,7 +1647,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-                         int block_w, int block_h, int luma_weight, int luma_offset)
- {
--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-     cmd->cmd = RPI_CMD_LUMA_UNI;
-     cmd->dst = dst;
-     cmd->dststride = dststride;
-@@ -1515,7 +1666,7 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
- {
--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-     cmd->cmd = RPI_CMD_LUMA_BI;
-     cmd->dst = dst;
-     cmd->dststride = dststride;
-@@ -1537,7 +1688,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
- {
--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-     cmd->cmd = RPI_CMD_CHROMA_UNI;
-     cmd->dst = dst0;
-     cmd->dststride = dststride;
-@@ -1555,7 +1706,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
- static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
- {
--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-     cmd->dst = dst0;
-     cmd->dststride = dststride;
-@@ -2037,7 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int chan = x0>>6; // 64 wide blocks per QPU
-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
--            uint32_t *y = s->y_mvs[chan % 12];
-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-                   int bw = nPbW-start_x;
-@@ -2057,7 +2208,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
--            s->y_mvs[chan % 12] = y;
-+            s->y_mvs[s->pass0_job][chan % 12] = y;
-         } else
- #endif
-         {
-@@ -2086,7 +2237,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
--                uint32_t *u = s->u_mvs[chan & 7];
-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2110,7 +2261,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[chan & 7] = u;
-+                s->u_mvs[s->pass0_job][chan & 7] = u;
-                 return;
-             }
- #endif
-@@ -2140,7 +2291,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int chan = x0>>6; // 64 wide blocks per QPU
-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
--            uint32_t *y = s->y_mvs[chan % 12];
-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-                   int bw = nPbW-start_x;
-@@ -2160,7 +2311,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
--            s->y_mvs[chan % 12] = y;
-+            s->y_mvs[s->pass0_job][chan % 12] = y;
-         } else
- #endif
- 
-@@ -2190,7 +2341,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
--                uint32_t *u = s->u_mvs[chan & 7];
-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2215,7 +2366,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[chan & 7] = u;
-+                s->u_mvs[s->pass0_job][chan & 7] = u;
-                 return;
-             }
- #endif
-@@ -2249,7 +2400,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int x2 = x0 + (mv2->x >> 2);
-             int y2 = y0 + (mv2->y >> 2);
-             int chan = x0>>6; // 64 wide blocks per QPU
--            uint32_t *y = s->y_mvs[chan % 12];
-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-                   int bw = nPbW-start_x;
-@@ -2265,7 +2416,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-                 }
-             }
--            s->y_mvs[chan % 12] = y;
-+            s->y_mvs[s->pass0_job][chan % 12] = y;
-         } else
- #endif
-         {
-@@ -2298,7 +2449,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- 
-                 int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
- 
--                uint32_t *u = s->u_mvs[chan & 7];
-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2327,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[chan & 7] = u;
-+                s->u_mvs[s->pass0_job][chan & 7] = u;
-                 return;
-             }
- #endif
-@@ -2832,40 +2983,54 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
- static void rpi_execute_dblk_cmds(HEVCContext *s)
- {
-     int n;
-+    int job = s->pass2_job;
-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
--    int (*p)[2] = s->dblk_cmds;
--    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-+    int (*p)[2] = s->dblk_cmds[job];
-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-         ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-     }
--    s->num_dblk_cmds = 0;
-+    s->num_dblk_cmds[job] = 0;
- }
- 
- static void rpi_execute_transform(HEVCContext *s)
- {
-     int i=2;
-+#ifdef LAUNCH_PASS0
-+    int job = s->pass0_job;
-+#else
-+    int job = s->pass1_job;
-+#endif
-     //int j;
-     //int16_t *coeffs = s->coeffs_buf_arm[i];
-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-     //    s->hevcdsp.idct[4-2](coeffs, 16);
-     //}
- 
--    gpu_cache_flush(&s->coeffs_buf_accelerated);
--    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-+                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-     //vpu_wait(s->vpu_id);
- 
-     for(i=0;i<4;i++)
--        s->num_coeffs[i] = 0;
-+        s->num_coeffs[job][i] = 0;
- }
- 
- static void rpi_execute_pred_cmds(HEVCContext *s)
- {
-   int i;
--  HEVCPredCmd *cmd = s->univ_pred_cmds;
-+  int job = s->pass2_job;
-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-+#ifdef RPI_WORKER
-+  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+#else
-   HEVCLocalContext *lc = s->HEVClc;
-+#endif
- 
--  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
-+  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
-       if (cmd->type == RPI_PRED_INTRA) {
-           lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-           lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-@@ -2884,21 +3049,26 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
- #endif
-       }
-   }
--  s->num_pred_cmds = 0;
-+  s->num_pred_cmds[job] = 0;
- }
- 
- static void rpi_execute_inter_cmds(HEVCContext *s)
- {
--    HEVCMvCmd *cmd = s->unif_mv_cmds;
-+#ifdef INTER_PASS0
-+    int job = s->pass0_job;
-+#else
-+    int job = s->pass1_job;
-+#endif
-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-     int n,cidx;
-     AVFrame myref;
-     AVFrame myref1;
-     struct MvField mymv;
--    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
-         printf("Overflow inter_cmds\n");
-         exit(-1);
-     }
--    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
-         switch(cmd->cmd) {
-         case RPI_CMD_LUMA_UNI:
-             myref.data[0] = cmd->src;
-@@ -2938,7 +3108,28 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-             break;
-         }
-     }
--    s->num_mv_cmds = 0;
-+    s->num_mv_cmds[job] = 0;
-+}
-+
-+static void rpi_do_all_passes(HEVCContext *s)
-+{
 +#ifdef RPI_INTER_QPU
-+    // Kick off inter prediction on QPUs
-+    rpi_execute_inter_qpu(s);
-+#else
-+    rpi_execute_transform(s);
-+#endif
-+    // Perform luma inter prediction
-+    rpi_execute_inter_cmds(s);
-+    // Wait for transform completion
-+    vpu_wait(s->vpu_id);
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+#ifdef RPI_INTER_QPU
-+    rpi_inter_clear(s);
-+#endif
- }
- 
- #endif
-@@ -2946,6 +3137,7 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
- #ifdef RPI_INTER_QPU
- static void rpi_inter_clear(HEVCContext *s)
- {
-+    int job = s->pass0_job;
-     int i;
-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-@@ -2953,51 +3145,50 @@ static void rpi_inter_clear(HEVCContext *s)
-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
-     for(i=0;i<8;i++) {
--        s->u_mvs[i] = s->mvs_base[i];
--        *s->u_mvs[i]++ = 0;
--        *s->u_mvs[i]++ = 0;
--        *s->u_mvs[i]++ = 0;
--        *s->u_mvs[i]++ = 0;
--        *s->u_mvs[i]++ = 0;
--        *s->u_mvs[i]++ = pic_width;
--        *s->u_mvs[i]++ = pic_height;
--        *s->u_mvs[i]++ = s->frame->linesize[1];
--        *s->u_mvs[i]++ = s->frame->linesize[2];
-+        s->u_mvs[job][i] = s->mvs_base[job][i];
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = 0;
-+        *s->u_mvs[job][i]++ = pic_width;
-+        *s->u_mvs[job][i]++ = pic_height;
-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
-         if (weight_flag) {
--            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
--            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+            *s->u_mvs[job][i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+            *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
-         } else {
--            *s->u_mvs[i]++ = 1 << 5;
--            *s->u_mvs[i]++ = 6;
-+            *s->u_mvs[job][i]++ = 1 << 5;
-+            *s->u_mvs[job][i]++ = 6;
-         }
--        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-     }
- 
- #ifdef RPI_LUMA_QPU
-     for(i=0;i<12;i++) {
--        s->y_mvs[i] = s->y_mvs_base[i];
--        *s->y_mvs[i]++ = 0; // y_x
--        *s->y_mvs[i]++ = 0; // ref_y_base
--        *s->y_mvs[i]++ = 0; // y2_x2
--        *s->y_mvs[i]++ = 0; // ref_y2_base
--        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
--        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
--        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
-+        *s->y_mvs[job][i]++ = 0; // y_x
-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
-+        *s->y_mvs[job][i]++ = 0; // y2_x2
-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
-         if (weight_flag) {
-             int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-             int shift = s->sh.luma_log2_weight_denom + 6;
--            *s->y_mvs[i]++ = (offset << 16) + shift;
-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-         } else {
-             int offset = 1 << 5;
-             int shift = 6;
--            *s->y_mvs[i]++ = (offset << 16) + shift;
-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-         }
--        *s->y_mvs[i]++ = 0; // Next kernel
-+        *s->y_mvs[job][i]++ = 0; // Next kernel
-     }
- #endif
- }
- 
--
- #ifdef RPI_SIMULATE_QPUS
- 
- static int32_t clipx(int x,int FRAME_WIDTH)
-@@ -3271,10 +3462,15 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
- static void rpi_execute_inter_qpu(HEVCContext *s)
- {
-     int k;
-+#ifdef LAUNCH_PASS0
-+    int job = s->pass0_job;
-+#else
++
++static void rpi_launch_vpu_qpu(HEVCContext *s)
++{
++    int k;
 +    int job = s->pass1_job;
-+#endif
-     int i;
--    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
++    int i;
 +    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
- #ifdef RPI_LUMA_QPU
--    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
++#ifdef RPI_LUMA_QPU
 +    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
- #endif
-     if (s->sh.slice_type == I_SLICE) {
- #ifdef RPI_MULTI_MAILBOX
-@@ -3283,22 +3479,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- #endif
-     }
-     for(k=0;k<8;k++) {
--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
--        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
++#endif
++    if (s->sh.slice_type == I_SLICE) {
++#ifdef RPI_MULTI_MAILBOX
++      rpi_execute_transform(s);
++      return;
++#endif
++    }
++    for(k=0;k<8;k++) {
 +        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
 +        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
 +        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
 +        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
-     }
- 
--    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
++    }
++
 +    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
- 
- #ifdef RPI_LUMA_QPU
-     for(k=0;k<12;k++) {
--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
--        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
++
++#ifdef RPI_LUMA_QPU
++    for(k=0;k<12;k++) {
 +        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
 +        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
++        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
 +        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
-     }
--    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+    s->y_mvs[job][12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
- #endif
- 
- #ifdef RPI_SIMULATE_QPUS
-@@ -3308,34 +3504,34 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- 
- #ifdef RPI_MULTI_MAILBOX
- #ifdef RPI_CACHE_UNIF_MVS
--    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-+    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
- #else
--    gpu_cache_flush(&s->coeffs_buf_accelerated);
-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
- #endif
--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
--                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++    }
++    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
++#endif
++
++#ifdef RPI_SIMULATE_QPUS
++    rpi_simulate_inter_qpu(s);
++    return;
++#endif
++
++#ifdef RPI_MULTI_MAILBOX
++#ifdef RPI_CACHE_UNIF_MVS
++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
++#else
++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
++#endif
++
++#if 1
++    {
++        unsigned int i;
++        uint32_t * p;
++        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
++        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
++
++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
++            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
++            *p++ = code;
++        }
++
++        code = qpu_get_fn(QPU_MC_SETUP);
++        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
++            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
++            *p++ = code;
++        }
++
++        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
++            vpu_get_constants(),
++            s->coeffs_buf_vc[job][2],
++            s->num_coeffs[job][2] >> 8,
++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
++            s->num_coeffs[job][3] >> 10,
++            0,
++            // QPU job 1
++            QPU_N_UV,
++            mail_uv,
++            // QPU job 2
++            QPU_N_Y,
++            mail_y
++            );
++    }
++
++#else
++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
++                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
++                                   qpu_get_fn(QPU_MC_SETUP_UV),
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
@@ -29817,20 +3976,8 @@ index 639e4df..12aacc5 100644
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
- #ifdef RPI_LUMA_QPU
-                                    qpu_get_fn(QPU_MC_SETUP),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
++#ifdef RPI_LUMA_QPU
++                                   qpu_get_fn(QPU_MC_SETUP),
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
@@ -29843,25 +3990,19 @@ index 639e4df..12aacc5 100644
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
 +                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
- #else
-                                    0,
-                                    0,0,0,0,
-@@ -3344,17 +3540,17 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- #endif
-                                  );
-     for(i=0;i<4;i++)
--        s->num_coeffs[i] = 0;
++#else
++                                   0,
++                                   0,0,0,0,
++                                   0,0,0,0,
++                                   0,0,0,0
++#endif
++                                 );
++#endif
++    for(i=0;i<4;i++)
 +        s->num_coeffs[job][i] = 0;
- #else
-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
--      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
--      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
++#else
++#error Code rotted here
++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
 +      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
@@ -29870,1198 +4011,11 @@ index 639e4df..12aacc5 100644
 +      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 +      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
-       );
- #endif
- 
-@@ -3411,6 +3607,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         }
-     }
- 
-+#ifdef RPI_WORKER
-+    s->pass0_job = 0;
-+    s->pass1_job = 0;
-+    s->pass2_job = 0;
++      );
 +#endif
- #ifdef RPI_INTER_QPU
-     rpi_inter_clear(s);
- #endif
-@@ -3431,46 +3632,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+
- #ifdef RPI
-         if (s->enable_rpi) {
--          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
--          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
--            // Transform all blocks
--            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
--#ifdef RPI_MULTI_MAILBOX
--            // Kick off inter prediction on QPUs
--            rpi_execute_inter_qpu(s);
--            // Perform luma inter prediction
--            rpi_execute_inter_cmds(s);
--#else
--            rpi_execute_transform(s);
--            // Perform inter prediction
--            rpi_execute_inter_cmds(s);
--#ifdef RPI_INTER_QPU
--            // Kick off inter prediction on QPUs
--            rpi_execute_inter_qpu(s);
--#endif
--#endif
--
--            // Wait for transform completion
--            vpu_wait(s->vpu_id);
--
--            // Copy back reconstructed data
--            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
--            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
--            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-+#ifdef RPI_WORKER
-+            if (s->used_for_ref) {
-+              // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+  #ifdef INTER_PASS0
-+              rpi_execute_inter_cmds(s);
-+  #endif
-+  #ifdef LAUNCH_PASS0
-+              rpi_execute_inter_qpu(s);
-+  #endif
-+              // Pass on this job to worker thread
-+              worker_submit_job(s);
-+              // Make sure we have space to prepare the next job
-+              worker_pass0_ready(s);
- 
--            // Perform intra prediction and residual reconstruction
--            rpi_execute_pred_cmds(s);
--            // Perform deblocking for CTBs in this row
--            rpi_execute_dblk_cmds(s);
-+              // Prepare the next batch of commands
- #ifdef RPI_INTER_QPU
--            rpi_inter_clear(s);
-+              rpi_inter_clear(s);
-+#endif
-+            } else {
-+              // Non-ref frame so do it all on this thread
-+              rpi_do_all_passes(s);
-+            }
-+#else
-+            rpi_do_all_passes(s);
- #endif
-           }
-         }
- #endif
 +
 +
-         if (more_data < 0) {
-             s->tab_slice_address[ctb_addr_rs] = -1;
-             return more_data;
-@@ -3487,18 +3684,21 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     }
- 
- #ifdef RPI
--    if (s->enable_rpi && s->num_dblk_cmds) {
--#ifdef RPI_INTER_QPU
--        rpi_execute_inter_qpu(s);
--#endif
--#ifndef RPI_MULTI_MAILBOX
--        rpi_execute_transform(s);
-+
-+#ifdef RPI_WORKER
-+    // Wait for the worker to finish all its jobs
-+    if (s->enable_rpi) {
-+        worker_wait(s);
-+        av_assert0(s->pass0_job==s->pass1_job);
-+        av_assert0(s->pass1_job==s->pass2_job);
-+    }
- #endif
--        rpi_execute_inter_cmds(s);
--        vpu_wait(s->vpu_id);
--        rpi_execute_pred_cmds(s);
--        rpi_execute_dblk_cmds(s);
-+
-+    // Finish off any half-completed rows
-+    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-+        rpi_do_all_passes(s);
-     }
-+
- #endif
- 
-     if (x_ctb + ctb_size >= s->ps.sps->width &&
-@@ -4230,6 +4430,48 @@ fail:
-     return AVERROR(ENOMEM);
- }
- 
-+#ifdef RPI_WORKER
-+static av_cold void hevc_init_worker(HEVCContext *s)
-+{
-+    int err;
-+    pthread_cond_init(&s->worker_cond_head, NULL);
-+    pthread_cond_init(&s->worker_cond_middle, NULL);
-+    pthread_cond_init(&s->worker_cond_tail, NULL);
-+    pthread_mutex_init(&s->worker_mutex, NULL);
-+
-+    s->worker_tail=0;
-+    s->worker_middle=0;
-+    s->worker_head=0;
-+    s->kill_worker=0;
-+    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-+    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-+    if (err) {
-+        printf("Failed to create worker thread\n");
-+        exit(-1);
-+    }
 +}
-+
-+static av_cold void hevc_exit_worker(HEVCContext *s)
-+{
-+    void *res;
-+    s->kill_worker=1;
-+    pthread_cond_broadcast(&s->worker_cond_tail);
-+    pthread_cond_broadcast(&s->worker_cond_middle);
-+    pthread_join(s->worker_thread, &res);
-+    pthread_join(s->worker_deblock_thread, &res);
-+
-+    pthread_cond_destroy(&s->worker_cond_head);
-+    pthread_cond_destroy(&s->worker_cond_middle);
-+    pthread_cond_destroy(&s->worker_cond_tail);
-+    pthread_mutex_destroy(&s->worker_mutex);
-+
-+    s->worker_tail=0;
-+    s->worker_middle=0;
-+    s->worker_head=0;
-+    s->kill_worker=0;
-+}
-+#endif
-+
- static av_cold int hevc_decode_free(AVCodecContext *avctx)
- {
-     HEVCContext       *s = avctx->priv_data;
-@@ -4242,33 +4484,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->cabac_state);
- 
- #ifdef RPI
--    av_freep(&s->unif_mv_cmds);
--    av_freep(&s->univ_pred_cmds);
-+
-+#ifdef RPI_WORKER
-+    hevc_exit_worker(s);
-+#endif
-+
-+    for(i=0;i<RPI_MAX_JOBS;i++) {
-+      av_freep(&s->unif_mv_cmds[i]);
-+      av_freep(&s->univ_pred_cmds[i]);
- 
- #ifdef RPI_INTER_QPU
--    if (s->unif_mvs) {
--        gpu_free( &s->unif_mvs_ptr );
--        s->unif_mvs = 0;
--    }
-+      if (s->unif_mvs[i]) {
-+        gpu_free( &s->unif_mvs_ptr[i] );
-+        s->unif_mvs[i] = 0;
-+      }
- #endif
- #ifdef RPI_LUMA_QPU
--    if (s->y_unif_mvs) {
--        gpu_free( &s->y_unif_mvs_ptr );
--        s->y_unif_mvs = 0;
--    }
-+      if (s->y_unif_mvs[i]) {
-+        gpu_free( &s->y_unif_mvs_ptr[i] );
-+        s->y_unif_mvs[i] = 0;
-+      }
- #endif
--
--#ifdef EARLY_MALLOC
--    printf("hevc_decode_free\n");
--    if (s->coeffs_buf_arm[0]) {
--      gpu_free(&s->coeffs_buf_default);
--      s->coeffs_buf_arm[0] = 0;
--    }
--    if (s->coeffs_buf_arm[2]) {
--      gpu_free(&s->coeffs_buf_accelerated);
--      s->coeffs_buf_arm[2] = 0;
-     }
--#endif
-+
- #endif
- 
-     for (i = 0; i < 3; i++) {
-@@ -4328,6 +4566,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
- {
-     HEVCContext *s = avctx->priv_data;
-     int i;
-+    int job;
- 
-     s->avctx = avctx;
- 
-@@ -4338,12 +4577,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->sList[0] = s;
- 
- #ifdef RPI
--    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
--    if (!s->unif_mv_cmds)
--        goto fail;
--    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
--    if (!s->univ_pred_cmds)
--        goto fail;
-+    for(job=0;job<RPI_MAX_JOBS;job++) {
-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+        if (!s->unif_mv_cmds[job])
-+            goto fail;
-+        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+        if (!s->univ_pred_cmds[job])
-+            goto fail;
-+    }
- 
- #ifdef RPI_INTER_QPU
-     // We divide the image into blocks 256 wide and 64 high
-@@ -4354,18 +4595,20 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     {
-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-         uint32_t *p;
-+		for(job=0;job<RPI_MAX_JOBS;job++) {
- #ifdef RPI_CACHE_UNIF_MVS
--        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
- #else
--        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
- #endif
--        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
- 
--        // Set up initial locations for uniform streams
--        p = s->unif_mvs;
--        for(i = 0; i < 8; i++) {
--            s->mvs_base[i] = p;
-+          // Set up initial locations for uniform streams
-+          p = s->unif_mvs[job];
-+          for(i = 0; i < 8; i++) {
-+            s->mvs_base[job][i] = p;
-             p += uv_commands_per_qpu;
-+          }
-         }
-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-         s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-@@ -4374,61 +4617,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     }
- #endif
- #ifdef RPI_LUMA_QPU
-+    for(job=0;job<RPI_MAX_JOBS;job++)
-     {
-         int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-         uint32_t *p;
- #ifdef RPI_CACHE_UNIF_MVS
--        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
- #else
--        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
- #endif
--        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
- 
-         // Set up initial locations for uniform streams
--        p = s->y_unif_mvs;
-+        p = s->y_unif_mvs[job];
-         for(i = 0; i < 12; i++) {
--            s->y_mvs_base[i] = p;
-+            s->y_mvs_base[job][i] = p;
-             p += y_commands_per_qpu;
-         }
--        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
--        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
--
-     }
-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
- #endif
-     //gpu_malloc_uncached(2048*64,&s->dummy);
- 
--#ifdef EARLY_MALLOC
--    {
--        int coeffs_in_ctb = 64*64;
--        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
--        s->coeffs_buf_arm[0] = 0;
--        s->coeffs_buf_arm[2] = 0;
--        printf("Allocated %d\n",coefs_per_row);
--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
--        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
--        if (!s->coeffs_buf_arm[0])
--            goto fail;
--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
--        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
--        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
--        if (!s->coeffs_buf_arm[2])
--            goto fail;
--        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
--        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
--        printf("Done\n");
--#ifdef RPI_PRECLEAR
--        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
--        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
--        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
--        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
--        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
--        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
--#endif
--    }
--#endif
--
-     s->enable_rpi = 0;
- 
-+#ifdef RPI_WORKER
-+    hevc_init_worker(s);
-+#endif
-+
- #endif
- 
-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 5cb90b5..7bd295a 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -51,6 +51,12 @@
-     // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-     #define RPI_LUMA_QPU
-   #endif
-+
-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
-+  #define RPI_MAX_JOBS 2
-+  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+  #define RPI_WORKER
-+
- #endif
- 
- #define MAX_DPB_SIZE 16 // A.4.1
-@@ -806,6 +812,13 @@ typedef struct HEVCLocalContext {
-     int boundary_flags;
- } HEVCLocalContext;
- 
-+#ifdef RPI_WORKER
-+typedef struct HEVCLocalContextIntra {
-+    TransformUnit tu;
-+    NeighbourAvailable na;
-+} HEVCLocalContextIntra;
-+#endif
-+
- #ifdef RPI
- 
- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-@@ -874,7 +887,7 @@ typedef struct HEVCPredCmd {
- 
- typedef struct HEVCContext {
- #ifdef RPI
--    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
- #endif
-     const AVClass *c;  // needed by private avoptions
-     AVCodecContext *avctx;
-@@ -883,7 +896,9 @@ typedef struct HEVCContext {
- 
-     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
-     HEVCLocalContext    *HEVClc;
--
-+#ifdef RPI_WORKER
-+    HEVCLocalContextIntra HEVClcIntra;
-+#endif
-     uint8_t             threads_type;
-     uint8_t             threads_number;
- 
-@@ -894,43 +909,60 @@ typedef struct HEVCContext {
- 
- #ifdef RPI
-     int enable_rpi;
--    HEVCMvCmd *unif_mv_cmds;
--    HEVCPredCmd *univ_pred_cmds;
-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
-+    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
-     int buf_width;
--    GPU_MEM_PTR_T coeffs_buf_default;
--    GPU_MEM_PTR_T coeffs_buf_accelerated;
--    int16_t *coeffs_buf_arm[4];
--    unsigned int coeffs_buf_vc[4];
--    int num_coeffs[4];
--    int num_xfm_cmds;
--    int num_mv_cmds;
--    int num_pred_cmds;
--    int num_dblk_cmds;
-+    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
-+    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
-+    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
-+    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
-+    int num_coeffs[RPI_MAX_JOBS][4];
-+    int num_xfm_cmds[RPI_MAX_JOBS];
-+    int num_mv_cmds[RPI_MAX_JOBS];
-+    int num_pred_cmds[RPI_MAX_JOBS];
-+    int num_dblk_cmds[RPI_MAX_JOBS];
-     int vpu_id;
-     //GPU_MEM_PTR_T dummy;
-+    int pass0_job; // Pass0 does coefficient decode
-+    int pass1_job; // Pass1 does pixel processing
-+    int pass2_job; // Pass2 does reconstruction and deblocking
- #ifdef RPI_INTER_QPU
--    GPU_MEM_PTR_T unif_mvs_ptr;
--    uint32_t *unif_mvs; // Base of memory for motion vector commands
-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
- 
-     // _base pointers are to the start of the row
--    uint32_t *mvs_base[8];
-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
-     // these pointers are to the next free space
--    uint32_t *u_mvs[8];
-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
-     // Function pointers
-     uint32_t mc_filter_uv;
-     uint32_t mc_filter_uv_b0;
-     uint32_t mc_filter_uv_b;
- #endif
- #ifdef RPI_LUMA_QPU
--    GPU_MEM_PTR_T y_unif_mvs_ptr;
--    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
--    uint32_t *y_mvs_base[12];
--    uint32_t *y_mvs[12];
-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
-     // Function pointers
-     uint32_t mc_filter;
-     uint32_t mc_filter_b;
- #endif
- 
-+#ifdef RPI_WORKER
-+    pthread_t worker_thread;
-+    pthread_t worker_deblock_thread;
-+    pthread_cond_t worker_cond_head;
-+    pthread_cond_t worker_cond_tail;
-+    pthread_cond_t worker_cond_middle;
-+    pthread_mutex_t worker_mutex;
-+
-+    int worker_tail; // Contains the number of posted jobs
-+    int worker_head; // Contains the number of completed jobs
-+    int worker_middle; // Contains the number of completed jobs
-+    int kill_worker; // set to 1 to terminate the worker
-+#endif
-+
- #endif
- 
-     uint8_t *cabac_state;
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 38f53de..f0982cd 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1051,11 +1051,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     if (s->enable_rpi) {
-         int n = trafo_size * trafo_size;
-         if (use_vpu) {
--            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
--            s->num_coeffs[log2_trafo_size - 2] += n;
-+            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-         } else {
--            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
--            s->num_coeffs[0] += n;
-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-+            s->num_coeffs[s->pass0_job][0] += n;
-         }
-     }
-     // We now do the memset after transform_add while we know the data is cached.
-@@ -1508,7 +1508,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-             }
-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
--            s->hevcdsp.idct_4x4_luma(coeffs);
-+           s->hevcdsp.idct_4x4_luma(coeffs);
-         } else {
- #ifdef RPI
-             if (!use_vpu) {
-@@ -1553,7 +1553,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     }
- #ifdef RPI
-     if (s->enable_rpi) {
--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-         cmd->type = RPI_PRED_TRANSFORM_ADD;
-         cmd->size = log2_trafo_size;
-         cmd->buf = coeffs;
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-index 71c6d52..344e021 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
-@@ -71,8 +71,11 @@ do {                                  \
-                 AV_WN4P(&ptr[i], a);                                           \
-             else                                                               \
-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
--
-+#ifdef RPI_WORKER
-+    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+#else
-     HEVCLocalContext *lc = s->HEVClc;
-+#endif
-     int i;
-     int hshift = s->ps.sps->hshift[c_idx];
-     int vshift = s->ps.sps->vshift[c_idx];
--- 
-2.7.4
-
-
-From 1e0885f8d98175777fff65b4cedd708176c2abcf Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 3 Jun 2015 13:43:48 +0100
-Subject: [PATCH 54/68] Avoid lockup bug with RPI_WORKER enabled
-
----
- libavcodec/hevc.c       | 22 +++++++++++-----------
- libavcodec/hevc_cabac.c |  1 -
- 2 files changed, 11 insertions(+), 12 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 12aacc5..182a82f 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -133,11 +133,11 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
- static void worker_submit_job(HEVCContext *s)
- {
-   LOG_ENTER
--  //pthread_mutex_lock(&s->worker_mutex);
--  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
-+  pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_tail++;
-   s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-   pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
--  //pthread_mutex_unlock(&s->worker_mutex);
-+  pthread_mutex_unlock(&s->worker_mutex);
-   LOG_EXIT
- }
- 
-@@ -145,11 +145,11 @@ static void worker_submit_job(HEVCContext *s)
- static void worker_complete_middle_job(HEVCContext *s)
- {
-   LOG_ENTER
--  //pthread_mutex_lock(&s->worker_mutex);
--  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
-+  pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_middle++;
-   s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
--  //pthread_mutex_unlock(&s->worker_mutex);
-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-+  pthread_mutex_unlock(&s->worker_mutex);
-   LOG_EXIT
- }
- 
-@@ -157,11 +157,11 @@ static void worker_complete_middle_job(HEVCContext *s)
- static void worker_complete_job(HEVCContext *s)
- {
-   LOG_ENTER
--  //pthread_mutex_lock(&s->worker_mutex);
--  s->worker_head++; // This is the only place that can change head so we do not need the mutex
-+  pthread_mutex_lock(&s->worker_mutex);
-+  s->worker_head++;
-   s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
--  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
--  //pthread_mutex_unlock(&s->worker_mutex);
-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-+  pthread_mutex_unlock(&s->worker_mutex);
-   LOG_EXIT
- }
- 
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index f0982cd..6523e66 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -1497,7 +1497,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                 for (i = 0; i < 8; i++)
-                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-             }
--
-             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
- 
-             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
--- 
-2.7.4
-
-
-From 1d7ad81069dec6914ec7e9983855d7a1b5e4b123 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 3 Jun 2015 15:37:19 +0100
-Subject: [PATCH 55/68] Added code to flush buffers at start of frame
-
----
- libavcodec/hevc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 72 insertions(+)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 182a82f..e5b9f1e 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -43,6 +43,7 @@
- 
- #ifdef RPI
-   #include "rpi_qpu.h"
-+  #include "rpi_user_vcsm.h"
-   // Move Inter prediction into separate pass
-   #define RPI_INTER
- 
-@@ -3508,6 +3509,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- #else
-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
- #endif
-+
-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-@@ -3558,6 +3560,71 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- }
- #endif
- 
-+#ifdef RPI
-+
-+static void flush_buffer(AVBufferRef *bref) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+    gpu_cache_flush(p);
-+}
-+
-+static void flush_frame(HEVCContext *s,AVFrame *frame)
-+{
-+#if 1
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    int n = s->ps.sps->height;
-+    int curr_y = 0;
-+    int curr_uv = 0;
-+    int n_uv = n >> s->ps.sps->vshift[1];
-+    int sz,base;
-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+    base = s->frame->linesize[1] * curr_uv;
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = p->arm + base;
-+    iocache.s[0].size  = sz;
-+    p = av_buffer_pool_opaque(frame->buf[2]);
-+    iocache.s[1].handle = p->vcsm_handle;
-+    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = p->arm + base;
-+    iocache.s[1].size  = sz;
-+    p = av_buffer_pool_opaque(frame->buf[0]);
-+    sz = s->frame->linesize[0] * (n-curr_y);
-+    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p->vcsm_handle;
-+    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = p->arm + base;
-+    iocache.s[2].size  = sz;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+    flush_buffer(frame->buf[0]);
-+    flush_buffer(frame->buf[1]);
-+    flush_buffer(frame->buf[2]);
-+#endif
-+}
-+
-+static void flush_all(HEVCContext *s)
-+{
-+#if 0
-+    struct vcsm_user_clean_invalid_s iocache = {};
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
-+    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].cmd = 4; // Flush all
-+    iocache.s[0].addr = p->arm;
-+    iocache.s[0].size  = 4096;
-+    vcsm_clean_invalid( &iocache );
-+#else
-+  int i,k;
-+  for(i=0;i<2;i++) {
-+    for (k = 0; k < s->sh.nb_refs[i]; k++) {
-+      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
-+    }
-+  }
-+  flush_frame(s,s->frame);
-+#endif
-+}
-+#endif
-+
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- {
-     HEVCContext *s  = avctxt->priv_data;
-@@ -3592,8 +3659,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         printf("Weighted B slice\n");
-     }
- 
-+    // Now flush all reference frames and our destination frame to get everything ready for decode
-+    flush_all(s);
- #endif
- 
-+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-         return AVERROR_INVALIDDATA;
-@@ -3664,6 +3735,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-             rpi_do_all_passes(s);
- #endif
-           }
-+
-         }
- #endif
- 
--- 
-2.7.4
-
-
-From 7a57f233dcd4048e20a0b5bc06bc20abb589d3fa Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 3 Jun 2015 16:42:24 +0100
-Subject: [PATCH 56/68] Reduce the amount that needs to be flushed
-
----
- libavcodec/hevc.c | 35 +++++++++++------------------------
- 1 file changed, 11 insertions(+), 24 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index e5b9f1e..73d7f74 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3569,7 +3569,7 @@ static void flush_buffer(AVBufferRef *bref) {
- 
- static void flush_frame(HEVCContext *s,AVFrame *frame)
- {
--#if 1
-+#ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
-     int n = s->ps.sps->height;
-     int curr_y = 0;
-@@ -3603,26 +3603,6 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
- #endif
- }
- 
--static void flush_all(HEVCContext *s)
--{
--#if 0
--    struct vcsm_user_clean_invalid_s iocache = {};
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
--    iocache.s[0].handle = p->vcsm_handle;
--    iocache.s[0].cmd = 4; // Flush all
--    iocache.s[0].addr = p->arm;
--    iocache.s[0].size  = 4096;
--    vcsm_clean_invalid( &iocache );
--#else
--  int i,k;
--  for(i=0;i<2;i++) {
--    for (k = 0; k < s->sh.nb_refs[i]; k++) {
--      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
--    }
--  }
--  flush_frame(s,s->frame);
--#endif
--}
- #endif
- 
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-@@ -3658,9 +3638,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-         printf("Weighted B slice\n");
-     }
--
--    // Now flush all reference frames and our destination frame to get everything ready for decode
--    flush_all(s);
- #endif
- 
-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-@@ -4130,6 +4107,11 @@ static int hevc_frame_start(HEVCContext *s)
-     if (!s->avctx->hwaccel)
-         ff_thread_finish_setup(s->avctx);
- 
-+#ifdef RPI_INTER_QPU
-+    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
-+    flush_frame(s,s->frame);
-+#endif
-+
-     return 0;
- 
- fail:
-@@ -4331,6 +4313,11 @@ fail:
-         ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
- #endif
-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-+    } else if (s->ref) {
-+#ifdef RPI_INTER_QPU
-+      // When running single threaded we need to flush the whole frame
-+      flush_frame(s,s->frame);
-+#endif
-     }
-     return ret;
- }
--- 
-2.7.4
-
-
-From 26eba8e3266cc5f2120e8284a1ce486d6a402010 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 4 Jun 2015 07:59:28 +0100
-Subject: [PATCH 57/68] Corrected support for disabled rpi when using
- RPI_WORKER
-
----
- libavcodec/hevc.h              | 18 ++++++++++--------
- libavcodec/hevcpred_template.c |  2 +-
- 2 files changed, 11 insertions(+), 9 deletions(-)
-
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 7bd295a..3cb34bd 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -769,7 +769,17 @@ typedef struct HEVCFrame {
-     uint8_t flags;
- } HEVCFrame;
- 
-+#ifdef RPI_WORKER
-+typedef struct HEVCLocalContextIntra {
-+    TransformUnit tu;
-+    NeighbourAvailable na;
-+} HEVCLocalContextIntra;
-+#endif
-+
- typedef struct HEVCLocalContext {
-+    TransformUnit tu;
-+    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
-+
-     uint8_t cabac_state[HEVC_CONTEXTS];
- 
-     uint8_t stat_coeff[4];
-@@ -784,7 +794,6 @@ typedef struct HEVCLocalContext {
- 
-     int qPy_pred;
- 
--    TransformUnit tu;
- 
-     uint8_t ctb_left_flag;
-     uint8_t ctb_up_flag;
-@@ -801,7 +810,6 @@ typedef struct HEVCLocalContext {
-     int ct_depth;
-     CodingUnit cu;
-     PredictionUnit pu;
--    NeighbourAvailable na;
- 
- #define BOUNDARY_LEFT_SLICE     (1 << 0)
- #define BOUNDARY_LEFT_TILE      (1 << 1)
-@@ -812,12 +820,6 @@ typedef struct HEVCLocalContext {
-     int boundary_flags;
- } HEVCLocalContext;
- 
--#ifdef RPI_WORKER
--typedef struct HEVCLocalContextIntra {
--    TransformUnit tu;
--    NeighbourAvailable na;
--} HEVCLocalContextIntra;
--#endif
- 
- #ifdef RPI
- 
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-index 344e021..325b60e 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
-@@ -72,7 +72,7 @@ do {                                  \
-             else                                                               \
-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
- #ifdef RPI_WORKER
--    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
- #else
-     HEVCLocalContext *lc = s->HEVClc;
- #endif
--- 
-2.7.4
-
-
-From 5b3eee9be88a5326df7621de95095def969e05a8 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 4 Jun 2015 11:52:55 +0100
-Subject: [PATCH 58/68] Draft support for tiles
-
----
- libavcodec/hevc.c              | 140 +++++++++++++++++++++++------------------
- libavcodec/hevc.h              |  21 +++++--
- libavcodec/hevc_filter.c       |   2 +-
- libavcodec/hevcpred_template.c |   2 +-
- 4 files changed, 99 insertions(+), 66 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 73d7f74..ec67252 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -63,10 +63,10 @@
- 
-   static void rpi_execute_dblk_cmds(HEVCContext *s);
-   static void rpi_execute_transform(HEVCContext *s);
--  static void rpi_execute_inter_qpu(HEVCContext *s);
-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
-   static void rpi_execute_pred_cmds(HEVCContext *s);
-   static void rpi_execute_inter_cmds(HEVCContext *s);
--  static void rpi_inter_clear(HEVCContext *s);
-+  static void rpi_begin(HEVCContext *s);
- 
-   // Define INTER_PASS0 to do inter prediction in first pass
-   //#define INTER_PASS0
-@@ -90,16 +90,18 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
- 
- #ifdef RPI_INTER_QPU
- 
-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
-+// For each block of 64*64 the smallest block size is 8x4
-+// We also need an extra command for the setup information
-+
- #define RPI_CHROMA_COMMAND_WORDS 12
--#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
- // The QPU code for UV blocks only works up to a block width of 8
- #define RPI_CHROMA_BLOCK_WIDTH 8
- 
--// Split image of 2048 into parts 64 wide
--// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
--// For each block of 64*64 the smallest block size is 8x4
- #define RPI_LUMA_COMMAND_WORDS 9
--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
- 
- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
- 
-@@ -216,7 +218,7 @@ static void *worker_start(void *arg)
-     LOG_ENTER
-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
- #ifndef LAUNCH_PASS0
--    rpi_execute_inter_qpu(s);
-+    rpi_launch_vpu_qpu(s);
- #endif
- #ifndef INTER_PASS0
-     // Perform inter prediction
-@@ -322,9 +324,14 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
- 
- #ifdef RPI
-     av_assert0(sps);
--    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
--    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-     int job;
-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
-     for(job=0;job<RPI_MAX_JOBS;job++) {
-       printf("Allocated %d\n",coefs_per_row);
-       for(job=0;job<RPI_MAX_JOBS;job++) {
-@@ -2186,10 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-             int x1 = x0 + (mv->x >> 2);
-             int y1 = y0 + (mv->y >> 2);
--            int chan = x0>>6; // 64 wide blocks per QPU
-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+            uint32_t *y = s->curr_y_mvs;
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-                   int bw = nPbW-start_x;
-@@ -2209,7 +2215,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
--            s->y_mvs[s->pass0_job][chan % 12] = y;
-+            s->curr_y_mvs = y;
-         } else
- #endif
-         {
-@@ -2233,12 +2239,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- 
-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
--                int chan = x0>>8;
-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                uint32_t *u = s->curr_u_mvs;
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2262,7 +2266,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                s->curr_u_mvs = u;
-                 return;
-             }
- #endif
-@@ -2289,10 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-             int x1 = x0 + (mv->x >> 2);
-             int y1 = y0 + (mv->y >> 2);
--            int chan = x0>>6; // 64 wide blocks per QPU
-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+            uint32_t *y = s->curr_y_mvs;
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=16) {
-                   int bw = nPbW-start_x;
-@@ -2312,7 +2315,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
--            s->y_mvs[s->pass0_job][chan % 12] = y;
-+            s->curr_y_mvs = y;
-         } else
- #endif
- 
-@@ -2337,12 +2340,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
- 
-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
--                int chan = x0>>8;
-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
- 
--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                uint32_t *u = s->curr_u_mvs;
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2367,7 +2368,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                s->curr_u_mvs = u;
-                 return;
-             }
- #endif
-@@ -2400,8 +2401,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-             int y1 = y0 + (mv->y >> 2);
-             int x2 = x0 + (mv2->x >> 2);
-             int y2 = y0 + (mv2->y >> 2);
--            int chan = x0>>6; // 64 wide blocks per QPU
--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+            uint32_t *y = s->curr_y_mvs;
-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-                   int bw = nPbW-start_x;
-@@ -2417,7 +2417,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-                 }
-             }
--            s->y_mvs[s->pass0_job][chan % 12] = y;
-+            s->curr_y_mvs = y;
-         } else
- #endif
-         {
-@@ -2448,9 +2448,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                 int x2_c = x0_c + (mv2->x >> (2 + hshift));
-                 int y2_c = y0_c + (mv2->y >> (2 + hshift));
- 
--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
- 
--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                uint32_t *u = s->curr_u_mvs;
-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-                       int bw = nPbW_c-start_x;
-@@ -2479,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
--                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                s->curr_u_mvs = u;
-                 return;
-             }
- #endif
-@@ -3114,12 +3113,8 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
- 
- static void rpi_do_all_passes(HEVCContext *s)
- {
--#ifdef RPI_INTER_QPU
--    // Kick off inter prediction on QPUs
--    rpi_execute_inter_qpu(s);
--#else
--    rpi_execute_transform(s);
--#endif
-+    // Kick off QPUs and VPUs
-+    rpi_launch_vpu_qpu(s);
-     // Perform luma inter prediction
-     rpi_execute_inter_cmds(s);
-     // Wait for transform completion
-@@ -3128,18 +3123,18 @@ static void rpi_do_all_passes(HEVCContext *s)
-     rpi_execute_pred_cmds(s);
-     // Perform deblocking for CTBs in this row
-     rpi_execute_dblk_cmds(s);
--#ifdef RPI_INTER_QPU
--    rpi_inter_clear(s);
--#endif
-+    // Prepare next batch
-+    rpi_begin(s);
- }
- 
- #endif
- 
--#ifdef RPI_INTER_QPU
--static void rpi_inter_clear(HEVCContext *s)
-+#ifdef RPI
-+static void rpi_begin(HEVCContext *s)
- {
-     int job = s->pass0_job;
-     int i;
-+#ifdef RPI_INTER_QPU
-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-     int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-@@ -3165,6 +3160,8 @@ static void rpi_inter_clear(HEVCContext *s)
-         }
-         *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-     }
-+    s->curr_u_mvs = s->u_mvs[job][0];
-+#endif
- 
- #ifdef RPI_LUMA_QPU
-     for(i=0;i<12;i++) {
-@@ -3187,8 +3184,11 @@ static void rpi_inter_clear(HEVCContext *s)
-         }
-         *s->y_mvs[job][i]++ = 0; // Next kernel
-     }
-+    s->curr_y_mvs = s->y_mvs[job][0];
- #endif
-+    s->ctu_count = 0;
- }
-+#endif
- 
- #ifdef RPI_SIMULATE_QPUS
- 
-@@ -3459,8 +3459,9 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
- 
- #endif
- 
-+#ifdef RPI_INTER_QPU
- 
--static void rpi_execute_inter_qpu(HEVCContext *s)
-+static void rpi_launch_vpu_qpu(HEVCContext *s)
- {
-     int k;
- #ifdef LAUNCH_PASS0
-@@ -3558,6 +3559,15 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
- 
- 
- }
 +#else
 +
 +#ifdef RPI
@@ -31071,299 +4025,23 @@ index 73d7f74..ec67252 100644
 +}
 +#endif
 +
- #endif
- 
- #ifdef RPI
-@@ -3617,29 +3627,20 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI
- #ifdef RPI_INTER_QPU
-     s->enable_rpi = s->ps.sps->bit_depth == 8
--                    && s->ps.sps->width <= RPI_MAX_WIDTH
-                     && !s->ps.pps->cross_component_prediction_enabled_flag
--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
- #else
-     s->enable_rpi = s->ps.sps->bit_depth == 8
--                    && s->ps.sps->width <= RPI_MAX_WIDTH
--                    && !s->ps.pps->cross_component_prediction_enabled_flag
--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
- #endif
- 
-     if (!s->enable_rpi) {
-       if (s->ps.pps->cross_component_prediction_enabled_flag)
-         printf("Cross component\n");
--      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
--        printf("Tiles\n");
--      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
--        printf("Weighted P slice\n");
-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-         printf("Weighted B slice\n");
-     }
- #endif
--
-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
- 
-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-@@ -3660,8 +3661,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     s->pass1_job = 0;
-     s->pass2_job = 0;
- #endif
--#ifdef RPI_INTER_QPU
--    rpi_inter_clear(s);
++#endif
++
 +#ifdef RPI
-+    rpi_begin(s);
- #endif
- 
-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-@@ -3679,13 +3680,34 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
- 
-+#ifdef RPI_INTER_QPU
-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
++
++#ifndef RPI_FAST_CACHEFLUSH
++#error RPI_FAST_CACHEFLUSH is broken
++static void flush_buffer(AVBufferRef *bref) {
++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
++    gpu_cache_flush(p);
++}
 +#endif
 +
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- 
-+#ifdef RPI_INTER_QPU
-+        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-+#endif
-+#ifdef RPI_LUMA_QPU
-+        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-+#endif
-+
- #ifdef RPI
-         if (s->enable_rpi) {
-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
-+          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
-+          //av_assert0(s->pass0_job>=0);
-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
--          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+          s->ctu_count++;
-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
-+
-+          if ( s->ctu_count >= s->max_ctu_count ) {
- #ifdef RPI_WORKER
-             if (s->used_for_ref) {
-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
-@@ -3693,7 +3715,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-               rpi_execute_inter_cmds(s);
-   #endif
-   #ifdef LAUNCH_PASS0
--              rpi_execute_inter_qpu(s);
-+              rpi_launch_vpu_qpu(s);
-   #endif
-               // Pass on this job to worker thread
-               worker_submit_job(s);
-@@ -3701,9 +3723,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-               worker_pass0_ready(s);
- 
-               // Prepare the next batch of commands
--#ifdef RPI_INTER_QPU
--              rpi_inter_clear(s);
--#endif
-+              rpi_begin(s);
-             } else {
-               // Non-ref frame so do it all on this thread
-               rpi_do_all_passes(s);
-@@ -3744,7 +3764,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #endif
- 
-     // Finish off any half-completed rows
--    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-+    if (s->enable_rpi && s->ctu_count) {
-         rpi_do_all_passes(s);
-     }
- 
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 3cb34bd..a141316 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -823,8 +823,15 @@ typedef struct HEVCLocalContext {
- 
- #ifdef RPI
- 
-+// The processing is done in chunks
-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
-+// This is a distance of 1536 pixels across the screen
-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
-+// but allocate more memory and increase the latency before data in the next frame can be processed
-+#define RPI_NUM_CHUNKS 1
-+
- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
--#define RPI_MAX_WIDTH 2048
-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
- 
- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-@@ -888,9 +895,6 @@ typedef struct HEVCPredCmd {
- #endif
- 
- typedef struct HEVCContext {
--#ifdef RPI
--    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
--#endif
-     const AVClass *c;  // needed by private avoptions
-     AVCodecContext *avctx;
- 
-@@ -928,6 +932,10 @@ typedef struct HEVCContext {
-     int pass0_job; // Pass0 does coefficient decode
-     int pass1_job; // Pass1 does pixel processing
-     int pass2_job; // Pass2 does reconstruction and deblocking
-+    int ctu_count; // Number of CTUs done in pass0 so far
-+    int max_ctu_count; // Number of CTUs when we trigger a round of processing
-+    int ctu_per_y_chan; // Number of CTUs per luma QPU
-+    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
- #ifdef RPI_INTER_QPU
-     GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-     uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-@@ -936,6 +944,7 @@ typedef struct HEVCContext {
-     uint32_t *mvs_base[RPI_MAX_JOBS][8];
-     // these pointers are to the next free space
-     uint32_t *u_mvs[RPI_MAX_JOBS][8];
-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
-     // Function pointers
-     uint32_t mc_filter_uv;
-     uint32_t mc_filter_uv_b0;
-@@ -946,6 +955,7 @@ typedef struct HEVCContext {
-     uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-     uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-     uint32_t *y_mvs[RPI_MAX_JOBS][12];
-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
-     // Function pointers
-     uint32_t mc_filter;
-     uint32_t mc_filter_b;
-@@ -1084,6 +1094,9 @@ typedef struct HEVCContext {
-     uint32_t max_mastering_luminance;
-     uint32_t min_mastering_luminance;
- 
-+#ifdef RPI
-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+#endif
- } HEVCContext;
- 
- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index b286bbf..1f04790 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -891,7 +891,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-         int n_uv = n >> s->ps.sps->vshift[1];
-         int sz,base;
-         if (curr_uv < 0) curr_uv = 0;
--        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-+        if (n_uv<=curr_uv) { return; }
-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-         base = s->frame->linesize[1] * curr_uv;
-         GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-index 325b60e..28d2653 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
-@@ -72,7 +72,7 @@ do {                                  \
-             else                                                               \
-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
- #ifdef RPI_WORKER
--    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-+    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
- #else
-     HEVCLocalContext *lc = s->HEVClc;
- #endif
--- 
-2.7.4
-
-
-From 1674a80d147e5342ef6ea9a4fb4ddfc640c15a05 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 4 Jun 2015 15:48:10 +0100
-Subject: [PATCH 59/68] Move deblocker into second pass
-
----
- libavcodec/hevc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
- 1 file changed, 65 insertions(+), 14 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index ec67252..6cecbdd 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -67,6 +67,8 @@
-   static void rpi_execute_pred_cmds(HEVCContext *s);
-   static void rpi_execute_inter_cmds(HEVCContext *s);
-   static void rpi_begin(HEVCContext *s);
-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
- 
-   // Define INTER_PASS0 to do inter prediction in first pass
-   //#define INTER_PASS0
-@@ -227,6 +229,11 @@ static void *worker_start(void *arg)
-     // Wait for transform completion
-     vpu_wait(s->vpu_id);
- 
-+    // Perform intra prediction and residual reconstruction
-+    rpi_execute_pred_cmds(s);
-+    // Perform deblocking for CTBs in this row
-+    rpi_execute_dblk_cmds(s);
-+
-     worker_complete_middle_job(s);
-     LOG_EXIT
-   }
-@@ -248,10 +255,6 @@ static void *worker_deblock_start(void *arg)
-       break;
-     }
-     LOG_ENTER
--    // Perform intra prediction and residual reconstruction
--    rpi_execute_pred_cmds(s);
--    // Perform deblocking for CTBs in this row
--    rpi_execute_dblk_cmds(s);
- 
-     worker_complete_job(s);
-     LOG_EXIT
-@@ -2983,7 +2986,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
- static void rpi_execute_dblk_cmds(HEVCContext *s)
- {
-     int n;
--    int job = s->pass2_job;
-+    int job = s->pass1_job;
-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-     int (*p)[2] = s->dblk_cmds[job];
-     for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-@@ -3021,7 +3024,7 @@ static void rpi_execute_transform(HEVCContext *s)
- static void rpi_execute_pred_cmds(HEVCContext *s)
- {
-   int i;
--  int job = s->pass2_job;
-+  int job = s->pass1_job;
-   HEVCPredCmd *cmd = s->univ_pred_cmds[job];
- #ifdef RPI_WORKER
-   HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-@@ -3506,11 +3509,10 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
- 
- #ifdef RPI_MULTI_MAILBOX
- #ifdef RPI_CACHE_UNIF_MVS
--    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
- #else
--    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
- #endif
--
-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-@@ -3613,6 +3615,60 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
- #endif
- }
- 
-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
++static void flush_frame(HEVCContext *s,AVFrame *frame)
 +{
 +#ifdef RPI_FAST_CACHEFLUSH
 +    struct vcsm_user_clean_invalid_s iocache = {};
++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
 +    int n = s->ps.sps->height;
 +    int curr_y = 0;
 +    int curr_uv = 0;
@@ -31371,22 +4049,70 @@ index ec67252..6cecbdd 100644
 +    int sz,base;
 +    sz = s->frame->linesize[1] * (n_uv-curr_uv);
 +    base = s->frame->linesize[1] * curr_uv;
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+    iocache.s[0].handle = p->vcsm_handle;
++    iocache.s[0].handle = p.vcsm_handle;
 +    iocache.s[0].cmd = 3; // clean+invalidate
-+    iocache.s[0].addr = p->arm + base;
++    iocache.s[0].addr = (int)(p.arm) + base;
 +    iocache.s[0].size  = sz;
-+    p = av_buffer_pool_opaque(frame->buf[2]);
-+    iocache.s[1].handle = p->vcsm_handle;
++    p = get_gpu_mem_ptr_v(s->frame);
++    iocache.s[1].handle = p.vcsm_handle;
 +    iocache.s[1].cmd = 3; // clean+invalidate
-+    iocache.s[1].addr = p->arm + base;
++    iocache.s[1].addr = (int)(p.arm) + base;
 +    iocache.s[1].size  = sz;
-+    p = av_buffer_pool_opaque(frame->buf[0]);
++    p = get_gpu_mem_ptr_y(s->frame);
 +    sz = s->frame->linesize[0] * (n-curr_y);
 +    base = s->frame->linesize[0] * curr_y;
-+    iocache.s[2].handle = p->vcsm_handle;
++    iocache.s[2].handle = p.vcsm_handle;
 +    iocache.s[2].cmd = 3; // clean+invalidate
-+    iocache.s[2].addr = p->arm + base;
++    iocache.s[2].addr = (int)(p.arm) + base;
++    iocache.s[2].size  = sz;
++    vcsm_clean_invalid( &iocache );
++#else
++    flush_buffer(frame->buf[0]);
++    flush_buffer(frame->buf[1]);
++    flush_buffer(frame->buf[2]);
++#endif
++}
++
++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
++{
++#ifdef RPI_FAST_CACHEFLUSH
++    struct vcsm_user_clean_invalid_s iocache = {};
++    int n;
++    int curr_y;
++    int curr_uv;
++    int n_uv;
++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
++    int sz,base;
++    int (*d)[2] = s->dblk_cmds[job];
++    int low=(*d)[1];
++    int high=(*d)[1];
++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
++        int y = (*d)[1];
++        low=FFMIN(low,y);
++        high=FFMAX(high,y);
++    }
++    curr_y = low;
++    n = high+(1 << s->ps.sps->log2_ctb_size);
++    curr_uv = curr_y >> s->ps.sps->vshift[1];
++    n_uv = n >> s->ps.sps->vshift[1];
++
++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
++    base = s->frame->linesize[1] * curr_uv;
++    iocache.s[0].handle = p.vcsm_handle;
++    iocache.s[0].cmd = 3; // clean+invalidate
++    iocache.s[0].addr = (int)(p.arm) + base;
++    iocache.s[0].size  = sz;
++    p = get_gpu_mem_ptr_v(s->frame);
++    iocache.s[1].handle = p.vcsm_handle;
++    iocache.s[1].cmd = 3; // clean+invalidate
++    iocache.s[1].addr = (int)(p.arm) + base;
++    iocache.s[1].size  = sz;
++    p = get_gpu_mem_ptr_y(s->frame);
++    sz = s->frame->linesize[0] * (n-curr_y);
++    base = s->frame->linesize[0] * curr_y;
++    iocache.s[2].handle = p.vcsm_handle;
++    iocache.s[2].cmd = 3; // clean+invalidate
++    iocache.s[2].addr = (int)(p.arm) + base;
 +    iocache.s[2].size  = sz;
 +
 +    iocache.s[3].handle = p0->vcsm_handle;
@@ -31414,520 +4140,1383 @@ index ec67252..6cecbdd 100644
 +#endif
 +}
 +
- #endif
- 
++#endif
++
  static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-@@ -4127,11 +4183,6 @@ static int hevc_frame_start(HEVCContext *s)
-     if (!s->avctx->hwaccel)
-         ff_thread_finish_setup(s->avctx);
+ {
+     HEVCContext *s  = avctxt->priv_data;
+@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+     int y_ctb       = 0;
+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
  
--#ifdef RPI_INTER_QPU
--    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
--    flush_frame(s,s->frame);
--#endif
--
-     return 0;
++#ifdef RPI
++    s->enable_rpi = s->ps.sps->bit_depth == 8
++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
++
++    if (!s->enable_rpi) {
++      if (s->ps.pps->cross_component_prediction_enabled_flag)
++        printf("Cross component\n");
++    }
++#endif
++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
++
+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+         return AVERROR_INVALIDDATA;
+@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+         }
+     }
  
- fail:
--- 
-2.7.4
-
-
-From a453fe438c4ab311d6476955d0a40a5d2ed8a1c6 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 4 Jun 2015 16:10:23 +0100
-Subject: [PATCH 60/68] Change order of ctu accesses to improve qpu performance
-
----
- libavcodec/hevc.c | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 6cecbdd..ec17e64 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -3737,19 +3737,19 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++#ifdef RPI_WORKER
++    s->pass0_job = 0;
++    s->pass1_job = 0;
++#endif
++#ifdef RPI
++    rpi_begin(s);
++#endif
++
+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+ 
+@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
          s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
  
- #ifdef RPI_INTER_QPU
--        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
++#ifdef RPI_INTER_QPU
 +        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
- #endif
- #ifdef RPI_LUMA_QPU
--        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
- #endif
- 
-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
- 
- #ifdef RPI_INTER_QPU
--        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
- #endif
- #ifdef RPI_LUMA_QPU
--        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
- #endif
- 
- #ifdef RPI
--- 
-2.7.4
-
-
-From 504de0435e8f660c1b7b2d6ec053dc922a2d2896 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 8 Jun 2015 09:36:59 +0100
-Subject: [PATCH 61/68] Removed deblocker thread
-
----
- libavcodec/hevc.c | 77 +++----------------------------------------------------
- libavcodec/hevc.h |  4 ---
- 2 files changed, 4 insertions(+), 77 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index ec17e64..1868532 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -70,11 +70,6 @@
-   static void flush_frame(HEVCContext *s,AVFrame *frame);
-   static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
- 
--  // Define INTER_PASS0 to do inter prediction in first pass
--  //#define INTER_PASS0
--  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
--  //#define LAUNCH_PASS0
--
- #endif
- 
- // #define DISABLE_MC
-@@ -147,24 +142,12 @@ static void worker_submit_job(HEVCContext *s)
- }
- 
- // Call this to say we have completed pass1
--static void worker_complete_middle_job(HEVCContext *s)
--{
--  LOG_ENTER
--  pthread_mutex_lock(&s->worker_mutex);
--  s->worker_middle++;
--  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
--  pthread_mutex_unlock(&s->worker_mutex);
--  LOG_EXIT
--}
--
--// Call this to say we have completed pass2
- static void worker_complete_job(HEVCContext *s)
- {
-   LOG_ENTER
-   pthread_mutex_lock(&s->worker_mutex);
-   s->worker_head++;
--  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-   pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-   pthread_mutex_unlock(&s->worker_mutex);
-   LOG_EXIT
-@@ -208,7 +191,7 @@ static void *worker_start(void *arg)
-   while(1) {
-     pthread_mutex_lock(&s->worker_mutex);
- 
--    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-+    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
-     {
-       pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-     }
-@@ -219,13 +202,9 @@ static void *worker_start(void *arg)
-     }
-     LOG_ENTER
-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
--#ifndef LAUNCH_PASS0
-     rpi_launch_vpu_qpu(s);
--#endif
--#ifndef INTER_PASS0
-     // Perform inter prediction
-     rpi_execute_inter_cmds(s);
--#endif
-     // Wait for transform completion
-     vpu_wait(s->vpu_id);
- 
-@@ -234,28 +213,6 @@ static void *worker_start(void *arg)
-     // Perform deblocking for CTBs in this row
-     rpi_execute_dblk_cmds(s);
- 
--    worker_complete_middle_job(s);
--    LOG_EXIT
--  }
--  return NULL;
--}
--
--static void *worker_deblock_start(void *arg)
--{
--  HEVCContext *s = (HEVCContext *)arg;
--  while(1) {
--    pthread_mutex_lock(&s->worker_mutex);
--    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
--    {
--      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
--    }
--    pthread_mutex_unlock(&s->worker_mutex);
--
--    if (s->kill_worker) {
--      break;
--    }
--    LOG_ENTER
--
-     worker_complete_job(s);
-     LOG_EXIT
-   }
-@@ -2998,11 +2955,7 @@ static void rpi_execute_dblk_cmds(HEVCContext *s)
- static void rpi_execute_transform(HEVCContext *s)
- {
-     int i=2;
--#ifdef LAUNCH_PASS0
--    int job = s->pass0_job;
--#else
-     int job = s->pass1_job;
--#endif
-     //int j;
-     //int16_t *coeffs = s->coeffs_buf_arm[i];
-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-@@ -3057,11 +3010,7 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
- 
- static void rpi_execute_inter_cmds(HEVCContext *s)
- {
--#ifdef INTER_PASS0
--    int job = s->pass0_job;
--#else
-     int job = s->pass1_job;
--#endif
-     HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-     int n,cidx;
-     AVFrame myref;
-@@ -3467,11 +3416,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
- static void rpi_launch_vpu_qpu(HEVCContext *s)
- {
-     int k;
--#ifdef LAUNCH_PASS0
--    int job = s->pass0_job;
--#else
-     int job = s->pass1_job;
--#endif
-     int i;
-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
- #ifdef RPI_LUMA_QPU
-@@ -3574,10 +3519,12 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
- 
- #ifdef RPI
- 
-+#ifndef RPI_FAST_CACHEFLUSH
- static void flush_buffer(AVBufferRef *bref) {
-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-     gpu_cache_flush(p);
- }
 +#endif
++#ifdef RPI_LUMA_QPU
++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
++#endif
++
+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
++
++#ifdef RPI_INTER_QPU
++        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
++#endif
++#ifdef RPI_LUMA_QPU
++        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
++#endif
++
++#ifdef RPI
++        if (s->enable_rpi) {
++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
++          //av_assert0(s->pass0_job>=0);
++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
++          s->ctu_count++;
++          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
++
++          if ( s->ctu_count >= s->max_ctu_count ) {
++#ifdef RPI_WORKER
++            if (s->used_for_ref) {
++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
++              // Pass on this job to worker thread
++              worker_submit_job(s);
++              // Make sure we have space to prepare the next job
++              worker_pass0_ready(s);
++
++              // Prepare the next batch of commands
++              rpi_begin(s);
++            } else {
++              // Non-ref frame so do it all on this thread
++              rpi_do_all_passes(s);
++            }
++#else
++            rpi_do_all_passes(s);
++#endif
++          }
++
++        }
++#endif
++
++
+         if (more_data < 0) {
+             s->tab_slice_address[ctb_addr_rs] = -1;
+             return more_data;
+@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
  
- static void flush_frame(HEVCContext *s,AVFrame *frame)
- {
-@@ -3715,7 +3662,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI_WORKER
-     s->pass0_job = 0;
-     s->pass1_job = 0;
--    s->pass2_job = 0;
- #endif
- #ifdef RPI
-     rpi_begin(s);
-@@ -3767,12 +3713,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- #ifdef RPI_WORKER
-             if (s->used_for_ref) {
-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
--  #ifdef INTER_PASS0
--              rpi_execute_inter_cmds(s);
--  #endif
--  #ifdef LAUNCH_PASS0
--              rpi_launch_vpu_qpu(s);
--  #endif
-               // Pass on this job to worker thread
-               worker_submit_job(s);
-               // Make sure we have space to prepare the next job
-@@ -3814,8 +3754,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-     // Wait for the worker to finish all its jobs
-     if (s->enable_rpi) {
-         worker_wait(s);
--        av_assert0(s->pass0_job==s->pass1_job);
--        av_assert0(s->pass1_job==s->pass2_job);
+         ctb_addr_ts++;
+         ff_hevc_save_states(s, ctb_addr_ts);
++#ifdef RPI
++        if (s->enable_rpi)
++            continue;
++#endif
+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
      }
- #endif
  
-@@ -4565,16 +4503,13 @@ static av_cold void hevc_init_worker(HEVCContext *s)
- {
-     int err;
-     pthread_cond_init(&s->worker_cond_head, NULL);
--    pthread_cond_init(&s->worker_cond_middle, NULL);
-     pthread_cond_init(&s->worker_cond_tail, NULL);
-     pthread_mutex_init(&s->worker_mutex, NULL);
++#ifdef RPI
++
++#ifdef RPI_WORKER
++    // Wait for the worker to finish all its jobs
++    if (s->enable_rpi) {
++        worker_wait(s);
++    }
++#endif
++
++    // Finish off any half-completed rows
++    if (s->enable_rpi && s->ctu_count) {
++        rpi_do_all_passes(s);
++    }
++
++#endif
++
+     if (x_ctb + ctb_size >= s->ps.sps->width &&
+         y_ctb + ctb_size >= s->ps.sps->height)
+         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+     s = s1->sList[self_id];
+     lc = s->HEVClc;
  
-     s->worker_tail=0;
--    s->worker_middle=0;
-     s->worker_head=0;
-     s->kill_worker=0;
-     err = pthread_create(&s->worker_thread, NULL, worker_start, s);
--    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-     if (err) {
-         printf("Failed to create worker thread\n");
-         exit(-1);
-@@ -4586,17 +4521,13 @@ static av_cold void hevc_exit_worker(HEVCContext *s)
-     void *res;
-     s->kill_worker=1;
-     pthread_cond_broadcast(&s->worker_cond_tail);
--    pthread_cond_broadcast(&s->worker_cond_middle);
-     pthread_join(s->worker_thread, &res);
--    pthread_join(s->worker_deblock_thread, &res);
++#ifdef RPI
++    s->enable_rpi = 0;
++    //printf("Wavefront\n");
++#endif
++
+     if(ctb_row) {
+         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
  
-     pthread_cond_destroy(&s->worker_cond_head);
--    pthread_cond_destroy(&s->worker_cond_middle);
-     pthread_cond_destroy(&s->worker_cond_tail);
-     pthread_mutex_destroy(&s->worker_mutex);
+@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+         if (ret < 0)
+             return ret;
  
-     s->worker_tail=0;
--    s->worker_middle=0;
-     s->worker_head=0;
-     s->kill_worker=0;
++        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
++                        s->nal_unit_type == NAL_TSA_N   ||
++                        s->nal_unit_type == NAL_STSA_N  ||
++                        s->nal_unit_type == NAL_RADL_N  ||
++                        s->nal_unit_type == NAL_RASL_N);
++
++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++            s->is_decoded = 0;
++            break;
++        }
+         if (s->max_ra == INT_MAX) {
+             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+                 s->max_ra = s->poc;
+@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+     }
+ 
+ fail:
+-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
++#ifdef RPI_INTER_QPU
++        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
++#endif
+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-
++    } else if (s->ref) {
++#ifdef RPI_INTER_QPU
++      // When running single threaded we need to flush the whole frame
++      flush_frame(s,s->frame);
++#endif
++    }
+     return ret;
  }
+ 
+@@ -3064,6 +4625,41 @@ fail:
+     return AVERROR(ENOMEM);
+ }
+ 
++#ifdef RPI_WORKER
++static av_cold void hevc_init_worker(HEVCContext *s)
++{
++    int err;
++    pthread_cond_init(&s->worker_cond_head, NULL);
++    pthread_cond_init(&s->worker_cond_tail, NULL);
++    pthread_mutex_init(&s->worker_mutex, NULL);
++
++    s->worker_tail=0;
++    s->worker_head=0;
++    s->kill_worker=0;
++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
++    if (err) {
++        printf("Failed to create worker thread\n");
++        exit(-1);
++    }
++}
++
++static av_cold void hevc_exit_worker(HEVCContext *s)
++{
++    void *res;
++    s->kill_worker=1;
++    pthread_cond_broadcast(&s->worker_cond_tail);
++    pthread_join(s->worker_thread, &res);
++
++    pthread_cond_destroy(&s->worker_cond_head);
++    pthread_cond_destroy(&s->worker_cond_tail);
++    pthread_mutex_destroy(&s->worker_mutex);
++
++    s->worker_tail=0;
++    s->worker_head=0;
++    s->kill_worker=0;
++}
++#endif
++
+ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+ {
+     HEVCContext       *s = avctx->priv_data;
+@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+ 
+     av_freep(&s->cabac_state);
+ 
++#ifdef RPI
++
++#ifdef RPI_WORKER
++    hevc_exit_worker(s);
++#endif
++
++    for(i=0;i<RPI_MAX_JOBS;i++) {
++      av_freep(&s->unif_mv_cmds[i]);
++      av_freep(&s->univ_pred_cmds[i]);
++
++#ifdef RPI_INTER_QPU
++      if (s->unif_mvs[i]) {
++        gpu_free( &s->unif_mvs_ptr[i] );
++        s->unif_mvs[i] = 0;
++      }
++#endif
++#ifdef RPI_LUMA_QPU
++      if (s->y_unif_mvs[i]) {
++        gpu_free( &s->y_unif_mvs_ptr[i] );
++        s->y_unif_mvs[i] = 0;
++      }
++#endif
++    }
++
++#endif
++
+     for (i = 0; i < 3; i++) {
+         av_freep(&s->sao_pixel_buffer_h[i]);
+         av_freep(&s->sao_pixel_buffer_v[i]);
+@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+     return 0;
+ }
+ 
++#ifdef RPI
++#ifdef RPI_PRECLEAR
++static av_cold void memclear16(int16_t *p, int n)  // Clear helper for RPI_PRECLEAR builds: passes p and n to the VPU code
++{
++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);  // presumably zeroes n int16_t values at p, like the disabled loop below - confirm
++  //int i;
++  //for(i=0;i<n;i++)
++  //  p[i] = 0;
++}
++#endif
++#endif
++
+ static av_cold int hevc_init_context(AVCodecContext *avctx)
+ {
+     HEVCContext *s = avctx->priv_data;
+     int i;
++    int job;
+ 
+     s->avctx = avctx;
+ 
+@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+     s->HEVClcList[0] = s->HEVClc;
+     s->sList[0] = s;
+ 
++#ifdef RPI
++    for(job=0;job<RPI_MAX_JOBS;job++) {
++        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
++        if (!s->unif_mv_cmds[job])
++            goto fail;
++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
++        if (!s->univ_pred_cmds[job])
++            goto fail;
++    }
++
++#ifdef RPI_INTER_QPU
++    // We divide the image into blocks 256 wide and 64 high
++    // We support up to 2048 widths
++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
++    // Also add space for the startup command for each stream.
++
++    {
++        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
++        uint32_t *p;
++		for(job=0;job<RPI_MAX_JOBS;job++) {
++#ifdef RPI_CACHE_UNIF_MVS
++          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++#else
++          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
++#endif
++          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
++
++          // Set up initial locations for uniform streams
++          p = s->unif_mvs[job];
++          for(i = 0; i < 8; i++) {
++            s->mvs_base[job][i] = p;
++            p += uv_commands_per_qpu;
++          }
++        }
++        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
++        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
++        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
++    }
++
++#endif
++#ifdef RPI_LUMA_QPU
++    for(job=0;job<RPI_MAX_JOBS;job++)
++    {
++        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
++        uint32_t *p;
++#ifdef RPI_CACHE_UNIF_MVS
++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
++#else
++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
++#endif
++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
++
++        // Set up initial locations for uniform streams
++        p = s->y_unif_mvs[job];
++        for(i = 0; i < 12; i++) {
++            s->y_mvs_base[job][i] = p;
++            p += y_commands_per_qpu;
++        }
++    }
++    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
++    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
++#endif
++    //gpu_malloc_uncached(2048*64,&s->dummy);
++
++    s->enable_rpi = 0;
++
++#ifdef RPI_WORKER
++    hevc_init_worker(s);
++#endif
++
++#endif
++
+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+     if (!s->cabac_state)
+         goto fail;
 diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index a141316..ef5bfb1 100644
+index be91010..6b03ea8 100644
 --- a/libavcodec/hevc.h
 +++ b/libavcodec/hevc.h
-@@ -931,7 +931,6 @@ typedef struct HEVCContext {
-     //GPU_MEM_PTR_T dummy;
-     int pass0_job; // Pass0 does coefficient decode
-     int pass1_job; // Pass1 does pixel processing
--    int pass2_job; // Pass2 does reconstruction and deblocking
-     int ctu_count; // Number of CTUs done in pass0 so far
-     int max_ctu_count; // Number of CTUs when we trigger a round of processing
-     int ctu_per_y_chan; // Number of CTUs per luma QPU
-@@ -963,15 +962,12 @@ typedef struct HEVCContext {
+@@ -23,6 +23,9 @@
+ #ifndef AVCODEC_HEVC_H
+ #define AVCODEC_HEVC_H
  
- #ifdef RPI_WORKER
-     pthread_t worker_thread;
--    pthread_t worker_deblock_thread;
-     pthread_cond_t worker_cond_head;
-     pthread_cond_t worker_cond_tail;
--    pthread_cond_t worker_cond_middle;
-     pthread_mutex_t worker_mutex;
- 
-     int worker_tail; // Contains the number of posted jobs
-     int worker_head; // Contains the number of completed jobs
--    int worker_middle; // Contains the number of completed jobs
-     int kill_worker; // set to 1 to terminate the worker
- #endif
- 
--- 
-2.7.4
-
-
-From 74892301cdb0829de959b798debac6ffe1c71603 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 8 Jun 2015 11:04:43 +0100
-Subject: [PATCH 62/68] Reduced amount of output frame that is invalidated
-
----
- libavcodec/hevc.c | 45 +++++++++++++++++++++++++++++----------------
- 1 file changed, 29 insertions(+), 16 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 1868532..cbb4f46 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -68,7 +68,7 @@
-   static void rpi_execute_inter_cmds(HEVCContext *s);
-   static void rpi_begin(HEVCContext *s);
-   static void flush_frame(HEVCContext *s,AVFrame *frame);
--  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
- 
- #endif
- 
-@@ -3454,9 +3454,9 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
- 
- #ifdef RPI_MULTI_MAILBOX
- #ifdef RPI_CACHE_UNIF_MVS
--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
- #else
--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
- #endif
-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
-@@ -3530,6 +3530,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
- {
- #ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-     int n = s->ps.sps->height;
-     int curr_y = 0;
-     int curr_uv = 0;
-@@ -3537,22 +3538,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-     int sz,base;
-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-     base = s->frame->linesize[1] * curr_uv;
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-     iocache.s[0].handle = p->vcsm_handle;
-     iocache.s[0].cmd = 3; // clean+invalidate
--    iocache.s[0].addr = p->arm + base;
-+    iocache.s[0].addr = (int)(p->arm) + base;
-     iocache.s[0].size  = sz;
-     p = av_buffer_pool_opaque(frame->buf[2]);
-     iocache.s[1].handle = p->vcsm_handle;
-     iocache.s[1].cmd = 3; // clean+invalidate
--    iocache.s[1].addr = p->arm + base;
-+    iocache.s[1].addr = (int)(p->arm) + base;
-     iocache.s[1].size  = sz;
-     p = av_buffer_pool_opaque(frame->buf[0]);
-     sz = s->frame->linesize[0] * (n-curr_y);
-     base = s->frame->linesize[0] * curr_y;
-     iocache.s[2].handle = p->vcsm_handle;
-     iocache.s[2].cmd = 3; // clean+invalidate
--    iocache.s[2].addr = p->arm + base;
-+    iocache.s[2].addr = (int)(p->arm) + base;
-     iocache.s[2].size  = sz;
-     vcsm_clean_invalid( &iocache );
- #else
-@@ -3562,33 +3562,46 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
- #endif
- }
- 
--static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
- {
- #ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
--    int n = s->ps.sps->height;
--    int curr_y = 0;
--    int curr_uv = 0;
--    int n_uv = n >> s->ps.sps->vshift[1];
-+    int n;
-+    int curr_y;
-+    int curr_uv;
-+    int n_uv;
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-     int sz,base;
-+    int (*d)[2] = s->dblk_cmds[job];
-+    int low=(*d)[1];
-+    int high=(*d)[1];
-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
-+        int y = (*d)[1];
-+        low=FFMIN(low,y);
-+        high=FFMAX(high,y);
-+    }
-+    curr_y = low;
-+    n = high+(1 << s->ps.sps->log2_ctb_size);
-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
-+    n_uv = n >> s->ps.sps->vshift[1];
++// define RPI to split the CABAC/prediction/transform into separate stages
++#include "config.h"
 +
-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-     base = s->frame->linesize[1] * curr_uv;
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-     iocache.s[0].handle = p->vcsm_handle;
-     iocache.s[0].cmd = 3; // clean+invalidate
--    iocache.s[0].addr = p->arm + base;
-+    iocache.s[0].addr = (int)(p->arm) + base;
-     iocache.s[0].size  = sz;
-     p = av_buffer_pool_opaque(frame->buf[2]);
-     iocache.s[1].handle = p->vcsm_handle;
-     iocache.s[1].cmd = 3; // clean+invalidate
--    iocache.s[1].addr = p->arm + base;
-+    iocache.s[1].addr = (int)(p->arm) + base;
-     iocache.s[1].size  = sz;
-     p = av_buffer_pool_opaque(frame->buf[0]);
-     sz = s->frame->linesize[0] * (n-curr_y);
-     base = s->frame->linesize[0] * curr_y;
-     iocache.s[2].handle = p->vcsm_handle;
-     iocache.s[2].cmd = 3; // clean+invalidate
--    iocache.s[2].addr = p->arm + base;
-+    iocache.s[2].addr = (int)(p->arm) + base;
-     iocache.s[2].size  = sz;
+ #include "libavutil/buffer.h"
+ #include "libavutil/md5.h"
+ 
+@@ -37,6 +40,29 @@
+ #include "thread.h"
+ #include "videodsp.h"
+ 
++// define RPI to split the CABAC/prediction/transform into separate stages
++#ifdef RPI
++
++  #include "rpi_qpu.h"
++  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
++  #define RPI_INTER_QPU
++
++  #ifdef RPI_INTER_QPU
++    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
++    #define RPI_LUMA_QPU
++  #endif
++
++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
++  #define RPI_MAX_JOBS 2
++  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
++  #define RPI_WORKER
++  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
++//  #define RPI_DEBLOCK_VPU
++
++#endif
++
++#define RPI_VPU_DEBLOCK_CACHED 1
++
+ #define MAX_DPB_SIZE 16 // A.4.1
+ #define MAX_REFS 16
+ 
+@@ -660,17 +686,6 @@ typedef struct CodingUnit {
+     uint8_t cu_transquant_bypass_flag;
+ } CodingUnit;
+ 
+-typedef struct Mv {
+-    int16_t x;  ///< horizontal component of motion vector
+-    int16_t y;  ///< vertical component of motion vector
+-} Mv;
+-
+-typedef struct MvField {
+-    DECLARE_ALIGNED(4, Mv, mv)[2];
+-    int8_t ref_idx[2];
+-    int8_t pred_flag;
+-} MvField;
+-
+ typedef struct NeighbourAvailable {
+     int cand_bottom_left;
+     int cand_left;
+@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
+     uint8_t flags;
+ } HEVCFrame;
+ 
++#ifdef RPI_WORKER
++typedef struct HEVCLocalContextIntra {  // Only a TransformUnit and a NeighbourAvailable; HEVCLocalContext must start with the same two fields
++    TransformUnit tu;
++    NeighbourAvailable na;
++} HEVCLocalContextIntra;
++#endif
++
+ typedef struct HEVCLocalContext {
++    TransformUnit tu;
++    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
++
+     uint8_t cabac_state[HEVC_CONTEXTS];
+ 
+     uint8_t stat_coeff[4];
+@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
+ 
+     int qPy_pred;
+ 
+-    TransformUnit tu;
+ 
+     uint8_t ctb_left_flag;
+     uint8_t ctb_up_flag;
+@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
+     int ct_depth;
+     CodingUnit cu;
+     PredictionUnit pu;
+-    NeighbourAvailable na;
+ 
+ #define BOUNDARY_LEFT_SLICE     (1 << 0)
+ #define BOUNDARY_LEFT_TILE      (1 << 1)
+@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
+     int boundary_flags;
+ } HEVCLocalContext;
+ 
++
++#ifdef RPI
++
++// The processing is done in chunks
++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
++// This is a distance of 1536 pixels across the screen
++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
++// but allocate more memory and increase the latency before data in the next frame can be processed
++#define RPI_NUM_CHUNKS 1
++
++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
++
++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
++#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
++// Each block can have an intra prediction and a transform_add command
++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
++// Worst case is 16x16 CTUs
++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
++
++#define RPI_CMD_LUMA_UNI 0
++#define RPI_CMD_CHROMA_UNI 1
++#define RPI_CMD_LUMA_BI 2
++#define RPI_CMD_CHROMA_BI 3
++#define RPI_CMD_V_BI 4
++
++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
++// #define RPI_PRECLEAR
++
++// Command for inter prediction
++typedef struct HEVCMvCmd {
++    int cmd;  // presumably one of the RPI_CMD_* codes - confirm against the code that fills this in
++    uint8_t *dst;
++    ptrdiff_t dststride;
++    uint8_t *src;
++    ptrdiff_t srcstride;
++    Mv mv;
++    int x_off;
++    int y_off;
++    int block_w;
++    int block_h;
++    int weight;
++    int offset;
++    uint8_t *src1;  // src1/srcstride1/mv1: a second source set; presumably for the *_BI commands - confirm
++    ptrdiff_t srcstride1;
++    Mv mv1;
++    int8_t ref_idx[2];
++} HEVCMvCmd;
++
++
++// Command for intra prediction and transform_add of predictions to coefficients
++#define RPI_PRED_TRANSFORM_ADD 0
++#define RPI_PRED_INTRA 1
++typedef struct HEVCPredCmd {
++    uint8_t size;
++    uint8_t type;  // presumably RPI_PRED_TRANSFORM_ADD or RPI_PRED_INTRA, selecting which union members apply - confirm
++    uint8_t na;
++    uint8_t c_idx;
++    union {
++        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
++        uint32_t x;   // RPI_PRED_INTRA
++    };
++    union {
++        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
++        uint32_t y;   // RPI_PRED_INTRA
++    };
++    union {
++        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD  NOTE(review): an intra mode tagged TRANSFORM_ADD looks swapped with stride - confirm
++        uint32_t stride;         // RPI_PRED_INTRA
++    };
++} HEVCPredCmd;
++
++#endif
++
+ typedef struct HEVCContext {
+     const AVClass *c;  // needed by private avoptions
+     AVCodecContext *avctx;
+@@ -798,13 +895,107 @@ typedef struct HEVCContext {
+ 
+     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+     HEVCLocalContext    *HEVClc;
+-
++#ifdef RPI_WORKER
++    HEVCLocalContextIntra HEVClcIntra;
++#endif
+     uint8_t             threads_type;
+     uint8_t             threads_number;
+ 
+     int                 width;
+     int                 height;
+ 
++    int used_for_ref;
++
++#ifdef RPI
++    int enable_rpi;
++    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
++    int buf_width;
++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
++    int num_coeffs[RPI_MAX_JOBS][4];
++    int num_xfm_cmds[RPI_MAX_JOBS];
++    int num_mv_cmds[RPI_MAX_JOBS];
++    int num_pred_cmds[RPI_MAX_JOBS];
++    int num_dblk_cmds[RPI_MAX_JOBS];
++    int vpu_id;
++    int pass0_job; // Pass0 does coefficient decode
++    int pass1_job; // Pass1 does pixel processing
++    int ctu_count; // Number of CTUs done in pass0 so far
++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
++    int ctu_per_y_chan; // Number of CTUs per luma QPU
++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
++#ifdef RPI_INTER_QPU
++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
++
++    // _base pointers are to the start of the row
++    uint32_t *mvs_base[RPI_MAX_JOBS][8];
++    // these pointers are to the next free space
++    uint32_t *u_mvs[RPI_MAX_JOBS][8];
++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
++    // Function pointers
++    uint32_t mc_filter_uv;
++    uint32_t mc_filter_uv_b0;
++    uint32_t mc_filter_uv_b;
++#endif
++#ifdef RPI_LUMA_QPU
++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
++    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
++    uint32_t *y_mvs[RPI_MAX_JOBS][12];
++    uint32_t *curr_y_mvs; // Current uniform stream for luma
++    // Function pointers
++    uint32_t mc_filter;
++    uint32_t mc_filter_b;
++#endif
++
++#ifdef RPI_WORKER
++    pthread_t worker_thread;
++    pthread_cond_t worker_cond_head;
++    pthread_cond_t worker_cond_tail;
++    pthread_mutex_t worker_mutex;
++
++    int worker_tail; // Contains the number of posted jobs
++    int worker_head; // Contains the number of completed jobs
++    int kill_worker; // set to 1 to terminate the worker
++#endif
++
++#define RPI_DEBLOCK_VPU_Q_COUNT 2
++
++#ifdef RPI_DEBLOCK_VPU
++    int enable_rpi_deblock;
++
++    int uv_setup_width;
++    int uv_setup_height;
++    int setup_width; // Number of 16x16 blocks across the image
++    int setup_height; // Number of 16x16 blocks down the image
++
++    struct dblk_vpu_q_s
++    {
++        GPU_MEM_PTR_T deblock_vpu_gmem;
++
++        uint8_t (*y_setup_arm)[2][2][2][4];
++        uint8_t (*y_setup_vc)[2][2][2][4];
++
++        uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
++        uint8_t (*uv_setup_vc)[2][2][2][4];
++
++        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
++        int vpu_cmds_vc;
++
++        int cmd_id;
++    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
++
++    struct dblk_vpu_q_s * dvq;
++    unsigned int dvq_n;
++
++#endif
++
++#endif
++
+     uint8_t *cabac_state;
+ 
+     /** 1 if the independent slice segment header was successfully parsed */
+@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
+     uint32_t max_mastering_luminance;
+     uint32_t min_mastering_luminance;
+ 
++#ifdef RPI
++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
++#endif
+ } HEVCContext;
+ 
+ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                  int log2_trafo_size, enum ScanType scan_idx,
+                                  int c_idx);
+ 
++#ifdef RPI_INTER_QPU
++extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
++#endif
++
+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
  
-     iocache.s[3].handle = p0->vcsm_handle;
--- 
-2.7.4
-
-
-From 090b6be5b501bd3c547700926e540397f0b39e69 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Mon, 8 Jun 2015 11:55:29 +0100
-Subject: [PATCH 63/68] Packed 16x16 and 32x32 into the same buffer
-
----
- libavcodec/hevc.c       | 24 +++++++++++++++---------
- libavcodec/hevc_cabac.c |  9 ++++++++-
- libavcodec/rpi_qpu.c    |  2 +-
- 3 files changed, 24 insertions(+), 11 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index cbb4f46..a596534 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -299,12 +299,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-         s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-         if (!s->coeffs_buf_arm[job][0])
-             goto fail;
--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
-         s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-         s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-         if (!s->coeffs_buf_arm[job][2])
-             goto fail;
--        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
-         s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-       }
-     }
-@@ -2956,15 +2956,20 @@ static void rpi_execute_transform(HEVCContext *s)
- {
-     int i=2;
-     int job = s->pass1_job;
--    //int j;
--    //int16_t *coeffs = s->coeffs_buf_arm[i];
--    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
--    //    s->hevcdsp.idct[4-2](coeffs, 16);
--    //}
-+    /*int j;
-+    int16_t *coeffs = s->coeffs_buf_arm[job][i];
-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+        s->hevcdsp.idct[4-2](coeffs, 16);
-+    }
-+    i=3;
-+    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
-+        s->hevcdsp.idct[5-2](coeffs, 32);
-+    }*/
  
-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
--                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-                                s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-@@ -3458,7 +3463,8 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
- #else
-     flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
- #endif
--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
-                                    qpu_get_fn(QPU_MC_SETUP_UV),
-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-                                    (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
 diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 6523e66..8656917 100644
+index 05b2821..e2f1f4e 100644
 --- a/libavcodec/hevc_cabac.c
 +++ b/libavcodec/hevc_cabac.c
-@@ -1051,7 +1051,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     if (s->enable_rpi) {
-         int n = trafo_size * trafo_size;
-         if (use_vpu) {
--            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+@@ -21,14 +21,72 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#define UNCHECKED_BITSTREAM_READER 1
++
+ #include "libavutil/attributes.h"
+ #include "libavutil/common.h"
+ 
+-#include "cabac_functions.h"
+ #include "hevc.h"
++#include "cabac_functions.h"
++
++// BY22 is probably faster than simple bypass if the processor has
++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
++// x86 has fast int divide
++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
++// Use native divide if we have a fast one - otherwise use mpy 1/x
++// x86 has a fast integer divide - arm doesn't - unsure about other
++// architectures
++#define USE_BY22_DIV  ARCH_X86
++
++// Special case blocks with a single significant coeff
++// Decreases the complexity of the code for a common case but increases the
++// code size.
++#define USE_N_END_1 1
++
++#if ARCH_ARM
++#include "arm/hevc_cabac.h"
++#endif
+ 
+ #define CABAC_MAX_BIN 31
+ 
++
++#if USE_BY22 && !USE_BY22_DIV
++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
++
++static const uint32_t cabac_by22_inv_range[256] = {  // I(r) = 2^40 / r + 1 for r = 257..511, indexed by range & 0xff; entry 0 is 0, which get_cabac_by22_peek() tests for
++                                                    0,      I(257), I(258), I(259),
++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
++    I(510), I(511)
++};
++#undef I
++#endif  // USE_BY22
++
+ /**
+  * number of bin by SyntaxElement.
+  */
+@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
+     { 28, 36, 43, 49, 54, 58, 61, 63, },
+ };
+ 
++
++typedef struct  // One scan position, as built by the XYT() initialiser below
++{
++    uint16_t coeff;  // XYT_C(x,y,t): x + (y << t)
++    uint16_t scale;  // XYT_S(x,y,t): x and y each shifted right by SCALE_SHR(t), then packed with row stride 1 << SCALE_TRAFO(t)
++} xy_off_t;
++
++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
++
++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
++
++#define OFF_DIAG(t) {\
++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++#define OFF_HORIZ(t) {\
++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
++}
++
++#define OFF_VERT(t) {\
++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++static const xy_off_t off_xys[3][4][16] =  // [order: diag, horiz, vert][t - 2 for t = 2..5][16 positions of a 4x4 group]
++{
++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
++};
++
++
++// Helper fns
++#ifndef hevc_mem_bits32
++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)  // offset is a bit offset from buf
++{
++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);  // AV_RB32 read at byte offset/8, shifted left by offset%8, so the low offset%8 bits of the result are zero
++}
++#endif
++
++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
++#define hevc_clz32 hevc_clz32_builtin
++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)  // Leading-zero count of x as a 32-bit value, via the compiler builtin
++{
++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);  // NOTE: the GCC docs leave __builtin_clz(0) undefined, so x must be non-zero
++}
++#endif
++
++// It is unlikely that we will ever need this but include for completeness
++#ifndef hevc_clz32
++static inline unsigned int hevc_clz32(unsigned int x)  // Fallback used when no hevc_clz32 macro is defined: leading-zero count by successive halving
++{
++    unsigned int n = 1;  // starts at 1; the return statement subtracts the top bit to correct it
++    if ((x & 0xffff0000) == 0) {
++        n += 16;
++        x <<= 16;
++    }
++    if ((x & 0xff000000) == 0) {
++        n += 8;
++        x <<= 8;
++    }
++    if ((x & 0xf0000000) == 0) {
++        n += 4;
++        x <<= 4;
++    }
++    if ((x & 0xc0000000) == 0) {
++        n += 2;
++        x <<= 2;
++    }
++    return n - ((x >> 31) & 1);  // for x == 0 this returns 31, not 32
++}
++#endif
++
++
++#if !USE_BY22
++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
++// will no longer be called but the setup calls will still exist and we want
++// to null them out
++#define bypass_start(s)
++#define bypass_finish(s)
++#else
++// Use BY22 for residual bypass block
++
++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
++
++// BY22 notes that bypass is simply a divide into the bitstream and so we
++// can peek out large quantities of bits at once and treat the result as if
++// it was VLC.  In many cases this will lead to O(1) processing rather than
++// O(n) though the setup and teardown is sufficiently expensive that it is
++// only worth using if we expect to be dealing with more than a few bits
++// The definition of "a few bits" will vary from platform to platform but
++// tests on ARM show that it probably isn't worth it for a single coded
++// residual, but is for >1 - it also seems likely that if there are
++// more residuals then they are likely to be bigger and this will make the
++// O(1) nature of the code more worthwhile.
++
++
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS  22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55)  but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS  23
++#endif
++
++// Bypass block start
++// Must be called before _by22_peek is used as it sets the CABAC environment
++// into the correct state.  _by22_finish must be called to return to 'normal'
++// (i.e. non-bypass) cabac decoding
++static inline void get_cabac_by22_start(CABACContext * const c)
++{
++    const unsigned int bits = __builtin_ctz(c->low);
++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
++#if !USE_BY22_DIV
++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
++#endif
++
++    c->bytestream -= (CABAC_BITS / 8);
++    c->by22.bits = bits;
++#if !USE_BY22_DIV
++    c->by22.range = c->range;
++    c->range = inv;
++#endif
++    c->low = x;
++}
++
++// Bypass block finish
++// Must be called at the end of the bypass block to return to normal operation
++static inline void get_cabac_by22_finish(CABACContext * const c)
++{
++    unsigned int used = c->by22.bits;
++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
++
++    c->bytestream += bytes_used + (CABAC_BITS / 8);
++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
++#if !USE_BY22_DIV
++    c->range = c->by22.range;
++#endif
++}
++
++// Peek bypass bits
++// _by22_start must be called before _by22_peek is called and _by22_flush
++// must be called afterwards to flush any used bits
++// The actual number of valid bits returned is
++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
++// will be at least 22 which should be long enough for any prefix or suffix
++// though probably not long enough for the worst case combination
++#ifndef get_cabac_by22_peek
++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
++{
++#if USE_BY22_DIV
++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
++#else
++    uint32_t x = c->low & ~1U;
++    const uint32_t inv = c->range;
++
++    if (inv != 0)
++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
++
++    return x << 1;
++#endif
++}
++#endif
++
++// Flush bypass bits peeked by _by22_peek
++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
++// val is an unmodified copy of whatever _by22_peek returned
++#ifndef get_cabac_by22_flush
++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
++{
++    // Subtract the bits used & reshift up to the top of the word
++#if USE_BY22_DIV
++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
++#else
++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
++#endif
++
++    // and refill lower bits
++    // We will probably OR over some existing bits but that doesn't matter
++    c->by22.bits += n;
++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
++}
++#endif
++
++#endif  // USE_BY22
++
++
+ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
+ {
+     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
+@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
+     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
+ }
+ 
+-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
+ {
+-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
++    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
+ }
+ 
+-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
+ {
+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
+ }
+ 
+-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
+ {
+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
+ }
+ 
+ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
+ }
+ 
+-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
+                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
+ {
+     int i = 0;
+     int max = (log2_size << 1) - 1;
+     int ctx_offset, ctx_shift;
+ 
+-    if (!c_idx) {
++    if (!c_idx_nz) {
+         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+         ctx_shift = (log2_size + 1) >> 2;
+     } else {
+@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
+     return value;
+ }
+ 
+-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
+ {
+     int inc;
+ 
+-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
++    inc = (ctx_cg != 0) + (c_idx_nz << 1);
+ 
+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
+ }
+-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
+-                                           int offset, const uint8_t *ctx_idx_map)
+-{
+-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
+-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
+-}
+ 
+-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
+ {
+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+ }
+@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
+     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
+ }
+ 
+-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
++
++#if !USE_BY22
++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
++#endif
++
++
++#ifndef coeff_abs_level_remaining_decode_bypass
++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
+ {
++    CABACContext * const c = &s->HEVClc->cc;
++    uint32_t y;
++    unsigned int prefix;
++    unsigned int last_coeff_abs_level_remaining;
++    unsigned int n;
++
++    y = get_cabac_by22_peek(c);
++    prefix = hevc_clz32(~y);
++    // y << prefix will always have top bit 0
++
++    if (prefix < 3) {
++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
++        n = prefix + 1 + rice_param;
++    }
++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
++    {
++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
++
++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++        n = prefix * 2 + rice_param - 2;
++    }
++    else {
++        unsigned int suffix;
++
++        get_cabac_by22_flush(c, prefix, y);
++        y = get_cabac_by22_peek(c);
++
++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++        n = prefix + rice_param - 2;
++    }
++
++    get_cabac_by22_flush(c, n, y);
++
++    return last_coeff_abs_level_remaining;
++}
++#endif
++
++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
++{
++    CABACContext * const c = &s->HEVClc->cc;
+     int prefix = 0;
+     int suffix = 0;
+     int last_coeff_abs_level_remaining;
+     int i;
+ 
+-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
+         prefix++;
+     if (prefix == CABAC_MAX_BIN) {
+         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
+         return 0;
+     }
++
+     if (prefix < 3) {
+         for (i = 0; i < rc_rice_param; i++)
+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
++            suffix = (suffix << 1) | get_cabac_bypass(c);
+         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
+     } else {
+         int prefix_minus3 = prefix - 3;
+         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
++            suffix = (suffix << 1) | get_cabac_bypass(c);
+         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
+                                               << rc_rice_param) + suffix;
+     }
++
+     return last_coeff_abs_level_remaining;
+ }
+ 
+-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
++#if !USE_BY22
++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
+ {
+-    int i;
+-    int ret = 0;
++    CABACContext * const c = &s->HEVClc->cc;
++    unsigned int i;
++    uint32_t ret = 0;
+ 
+     for (i = 0; i < nb; i++)
+-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
+-    return ret;
++        ret = (ret << 1) | get_cabac_bypass(c);
++
++    return ret << (32 - nb);
++}
++#endif
++
++#ifndef coeff_sign_flag_decode_bypass
++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
++{
++    CABACContext * const c = &s->HEVClc->cc;
++    uint32_t y;
++    y = get_cabac_by22_peek(c);
++    get_cabac_by22_flush(c, nb, y);
++    return y & ~(0xffffffffU >> nb);
++}
++#endif
++
++
++#ifndef get_cabac_greater1_bits
++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
++    uint8_t * const state0)
++{
++    unsigned int i;
++    unsigned int rv = 0;
++    for (i = 0; i != n; ++i) {
++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
++        const unsigned int b = get_cabac(c, state0 + idx);
++        rv = (rv << 1) | b;
++    }
++    return rv;
++}
++#endif
++
++
++// N.B. levels returned are the values assuming coeff_abs_level_remaining
++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
++// this version of events.
++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
++    int * const pprev_subset_coded, int * const psum,
++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
++{
++    CABACContext * const c = &s->HEVClc->cc;
++    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
++    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
++    unsigned int rv;
++    unsigned int i;
++    const unsigned int n = FFMIN(n_end, 8);
++
++    // Really this is i != n but the simple unconditional loop is cheaper
++    // and faster
++    for (i = 0; i != 8; ++i)
++        levels[i] = 1;
++
++    rv = get_cabac_greater1_bits(c, n, state0);
++
++    *pprev_subset_coded = 0;
++    *psum = n;
++
++    rv <<= (32 - n);
++    if (rv != 0)
++    {
++        *pprev_subset_coded = 1;
++        *psum = n + 1;
++        i = hevc_clz32(rv);
++        levels[i] = 2;
++        if (get_cabac(c, state_gt2) == 0)
++        {
++            // Unset first coded bit
++            rv &= ~(0x80000000U >> i);
++        }
++    }
++
++    if (n_end > 8) {
++        const unsigned int g8 = n_end - 8;
++        rv |= ((1 << g8) - 1) << (24 - g8);
++        for (i = 0; i != g8; ++i) {
++            levels[i + 8] = 0;
++        }
++    }
++
++    return rv;
++}
++
++// extended_precision_processing_flag must be false given we are
++// putting the result into a 16-bit array
++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
++// scale_m is uint8_t
++//
++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
++//   or it can be 2 (if we have transquant_bypass)
++// shift is set to one less than we really want but would normally be
++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
++// to achieve it
++
++#ifndef trans_scale_sat
++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+ }
++#endif
++
++
++#ifndef update_rice
++static inline void update_rice(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)
++{
++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
++    if (x >= 6)
++        (*stat_coeff)++;
++    else if (x == 0 && *stat_coeff > 0)
++        (*stat_coeff)--;
++}
++#endif
++
++
++// For n down to 1: decode one bin using state0[ctx_map[n]]; n must be > 0 on entry
++#ifndef get_cabac_sig_coeff_flag_idxs
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,
++    uint8_t * p)
++{
++    do {
++        if (get_cabac(c, state0 + ctx_map[n]))
++            *p++ = n;  // record the index of every bin that decoded as non-zero
++    } while (--n != 0);  // n == 0 on entry would wrap round rather than stop
++    return p;  // one past the last index written
++}
++#endif
++
++// Stores flag indices into flag_idx via the helper; returns (helper's end pointer - flag_idx)
++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,
++    uint8_t * const flag_idx)
++{
++    int rv;
++
++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
++
++    return rv;
++}
++
++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++     x0,  x1,  x2,  x3,\
++     x4,  x5,  x6,  x7,\
++     x8,  x9, x10, x11,\
++    x12, x13, x14, x15}
++
++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++     x0,  x4,  x8, x12,\
++     x1,  x5,  x9, x13,\
++     x2,  x6, x10, x14,\
++     x3,  x7, x11, x15}
++
++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++     x0,  x4,  x1,  x8,\
++     x5,  x2, x12,  x9,\
++     x6,  x3, x13, x10,\
++     x7, x14, x11, x15}
++
++
++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
++    uint8_t * const significant_coeff_group_flag,
++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
++    int * const pPrev_sig)
++{
++    while (--i >= 0) {
++        unsigned int x_cg = scan_x_cg[i];
++        unsigned int y_cg = scan_y_cg[i];
++
++        // For the flag decode we only care about Z/NZ but
++        // we use the full Right + Down * 2 when calculating
++        // significant coeff flags so we obtain it here
++        //.
++        // The group flag array is one longer than it needs to
++        // be so we don't need to check for y_cg limits
++        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
++            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
++
++        if (i == 0 ||
++            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
++        {
++            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
++            *pPrev_sig = prev_sig;
++            break;
++        }
++    }
++
++    return i;
++}
++
+ 
+ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                 int log2_trafo_size, enum ScanType scan_idx,
+                                 int c_idx)
+ {
+-#define GET_COORD(offset, n)                                    \
+-    do {                                                        \
+-        x_c = (x_cg << 2) + scan_x_off[n];                      \
+-        y_c = (y_cg << 2) + scan_y_off[n];                      \
+-    } while (0)
+-    HEVCLocalContext *lc = s->HEVClc;
+-    int transform_skip_flag = 0;
++    HEVCLocalContext * const lc = s->HEVClc;
++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
+ 
+     int last_significant_coeff_x, last_significant_coeff_y;
+-    int last_scan_pos;
+-    int n_end;
+     int num_coeff = 0;
+-    int greater1_ctx = 1;
++    int prev_subset_coded = 0;
+ 
+     int num_last_subset;
+     int x_cg_last_sig, y_cg_last_sig;
+ 
+-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
++    const uint8_t *scan_x_cg, *scan_y_cg;
++    const xy_off_t * scan_xy_off;
+ 
+     ptrdiff_t stride = s->frame->linesize[c_idx];
+     int hshift = s->ps.sps->hshift[c_idx];
+     int vshift = s->ps.sps->vshift[c_idx];
+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++#ifdef RPI
++    //***** transform_skip_flag decoded later!
++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
++#endif
+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
+     int explicit_rdpcm_flag = 0;
+     int explicit_rdpcm_dir_flag;
+ 
+     int trafo_size = 1 << log2_trafo_size;
+     int i;
+-    int qp,shift,add,scale,scale_m;
++    int qp,shift,scale;
+     static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+     const uint8_t *scale_matrix = NULL;
+     uint8_t dc_scale;
+     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                          lc->tu.intra_pred_mode_c;
+ 
++    int prev_sig = 0;
++    const int c_idx_nz = (c_idx != 0);
++
++    int may_hide_sign;
++
++#ifdef RPI
++    if (s->enable_rpi) {
++        int n = trafo_size * trafo_size;
++        if (use_vpu) {
 +            // We support size 4 and size 5.
 +            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
 +            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
@@ -31936,123 +5525,797 @@ index 6523e66..8656917 100644
 +                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
 +            else
 +                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
-             s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-         } else {
-             coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 4480f72..0121fca 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -5,7 +5,7 @@
- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
- //#define RPI_TIME_TOTAL_VPU
- // define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
--//#define RPI_TIME_TOTAL_POSTED
-+#define RPI_TIME_TOTAL_POSTED
- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
- #define RPI_ASYNC
++            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
++        } else {
++            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
++            s->num_coeffs[s->pass0_job][0] += n;
++        }
++    }
++    // We now do the memset after transform_add while we know the data is cached.
++    #ifdef RPI_PRECLEAR
++    #else
++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
++    #endif
++#else
+     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
++#endif
++
++
  
--- 
-2.7.4
-
-
-From ed359bbce56817bf9db0e54701103bd0505c353b Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Thu, 25 Jun 2015 09:02:47 +0100
-Subject: [PATCH 64/68] Moved luma deblock to VPU
-
----
- libavcodec/hevc.c               |   18 +-
- libavcodec/hevc.h               |   11 +
- libavcodec/hevc_filter.c        |  120 ++-
- libavcodec/rpi_hevc_transform.h | 1802 ++++++++++++++++++++++++++++++++++++++-
- libavcodec/rpi_hevc_transform.s |  426 +++++++++
- libavcodec/rpi_qpu.c            |   12 +-
- libavcodec/rpi_shader.c         |    2 +-
- 7 files changed, 2378 insertions(+), 13 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index a596534..4ce94a7 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -246,6 +246,12 @@ static void pic_arrays_free(HEVCContext *s)
-       }
+     // Derive QP for dequant
+     if (!lc->cu.cu_transquant_bypass_flag) {
+-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
++        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+         static const uint8_t rem6[51 + 4 * 6 + 1] = {
+             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+         };
+         int qp_y = lc->qp_y;
+ 
++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
++
+         if (s->ps.pps->transform_skip_enabled_flag &&
+             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
+-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
++            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
++            if (transform_skip_flag) {
++                trans_skip_or_bypass = 1;
++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
++                    may_hide_sign = 0;
++                }
++            }
+         }
+ 
+         if (c_idx == 0) {
+@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+             qp += s->ps.sps->qp_bd_offset;
+         }
+ 
+-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
+-        add      = 1 << (shift-1);
+-        scale    = level_scale[rem6[qp]] << (div6[qp]);
+-        scale_m  = 16; // default when no custom scaling lists.
+-        dc_scale = 16;
++        // Shift is set to one less than will actually occur as the scale
++        // and saturate step adds 1 and then shifts right again
++        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
++        scale = level_scale[rem6[qp]];
++        if (div6[qp] >= shift) {
++            scale <<= (div6[qp] - shift);
++            shift = 0;
++        } else {
++            shift -= div6[qp];
++        }
+ 
+-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
+             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
+-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
+ 
+             matrix_id = 3 * matrix_id + c_idx;
+ 
+             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
++            dc_scale = scale_matrix[0];
+             if (log2_trafo_size >= 4)
+                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
+         }
++        else
++        {
++            static const uint8_t sixteen_scale[64] = {
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16
++            };
++            scale_matrix = sixteen_scale;
++            dc_scale = 16;
++        }
+     } else {
++        static const uint8_t unit_scale[64] = {
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++        };
++        scale_matrix = unit_scale;
+         shift        = 0;
+-        add          = 0;
+-        scale        = 0;
+-        dc_scale     = 0;
++        scale        = 2;  // We will shift right to kill this
++        dc_scale     = 1;
++
++        may_hide_sign = 0;
      }
- #endif
-+#ifdef RPI_DEBLOCK_VPU
-+    if (s->y_setup_arm) {
-+      gpu_free(&s->y_setup_ptr);
-+      s->y_setup_arm = 0;
+ 
+     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
++        trans_skip_or_bypass) {
++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
+         if (explicit_rdpcm_flag) {
+-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
++            may_hide_sign = 0;
++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
+         }
+     }
+ 
+-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
++    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
+                                            &last_significant_coeff_x, &last_significant_coeff_y);
+ 
+     if (last_significant_coeff_x > 3) {
+@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+         int last_x_c = last_significant_coeff_x & 3;
+         int last_y_c = last_significant_coeff_y & 3;
+ 
+-        scan_x_off = ff_hevc_diag_scan4x4_x;
+-        scan_y_off = ff_hevc_diag_scan4x4_y;
+         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
+-        if (trafo_size == 4) {
++
++        switch (log2_trafo_size) {
++        case 2:
+             scan_x_cg = scan_1x1;
+             scan_y_cg = scan_1x1;
+-        } else if (trafo_size == 8) {
++            break;
++        case 3:
+             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+             scan_x_cg = diag_scan2x2_x;
+             scan_y_cg = diag_scan2x2_y;
+-        } else if (trafo_size == 16) {
++            break;
++        case 4:
+             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+             scan_x_cg = ff_hevc_diag_scan4x4_x;
+             scan_y_cg = ff_hevc_diag_scan4x4_y;
+-        } else { // trafo_size == 32
++            break;
++        case 5:
++        default:
+             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
+             scan_x_cg = ff_hevc_diag_scan8x8_x;
+             scan_y_cg = ff_hevc_diag_scan8x8_y;
++            break;
+         }
+         break;
+     }
+     case SCAN_HORIZ:
+         scan_x_cg = horiz_scan2x2_x;
+         scan_y_cg = horiz_scan2x2_y;
+-        scan_x_off = horiz_scan4x4_x;
+-        scan_y_off = horiz_scan4x4_y;
+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
+         break;
+     default: //SCAN_VERT
+         scan_x_cg = horiz_scan2x2_y;
+         scan_y_cg = horiz_scan2x2_x;
+-        scan_x_off = horiz_scan4x4_y;
+-        scan_y_off = horiz_scan4x4_x;
+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
+         break;
+     }
+     num_coeff++;
+     num_last_subset = (num_coeff - 1) >> 4;
+ 
+-    for (i = num_last_subset; i >= 0; i--) {
+-        int n, m;
+-        int x_cg, y_cg, x_c, y_c, pos;
+-        int implicit_non_zero_coeff = 0;
+-        int64_t trans_coeff_level;
+-        int prev_sig = 0;
+-        int offset = i << 4;
+-        int rice_init = 0;
++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
+ 
+-        uint8_t significant_coeff_flag_idx[16];
+-        uint8_t nb_significant_coeff_flag = 0;
+-
+-        x_cg = scan_x_cg[i];
+-        y_cg = scan_y_cg[i];
+-
+-        if ((i < num_last_subset) && (i > 0)) {
+-            int ctx_cg = 0;
+-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
+ 
+-            significant_coeff_group_flag[x_cg][y_cg] =
+-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+-            implicit_non_zero_coeff = 1;
+-        } else {
+-            significant_coeff_group_flag[x_cg][y_cg] =
+-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+-             (x_cg == 0 && y_cg == 0));
+-        }
++    i = num_last_subset;
++    do {
++        int implicit_non_zero_coeff = 0;
++        int n_end;
+ 
+-        last_scan_pos = num_coeff - offset - 1;
++        uint8_t significant_coeff_flag_idx[16];
++        unsigned int nb_significant_coeff_flag = 0;
+ 
+         if (i == num_last_subset) {
++            // First time through
++            int last_scan_pos = num_coeff - (i << 4) - 1;
+             n_end = last_scan_pos - 1;
+             significant_coeff_flag_idx[0] = last_scan_pos;
+             nb_significant_coeff_flag = 1;
+         } else {
+             n_end = 15;
++            implicit_non_zero_coeff = (i != 0);
+         }
+ 
+-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
+-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
+-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
+-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
+-
+-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
+-            static const uint8_t ctx_idx_map[] = {
+-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
+-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
+-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
+-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
+-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
++        if (n_end >= 0) {
++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
++            };
++            static const uint8_t ctx_idx_maps[3][4][16] = {
++                {
++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                },
++                {
++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                },
++                {
++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                }
+             };
+             const uint8_t *ctx_idx_map_p;
+             int scf_offset = 0;
+-            if (s->ps.sps->transform_skip_context_enabled_flag &&
+-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
+-                if (c_idx == 0) {
+-                    scf_offset = 40;
+-                } else {
+-                    scf_offset = 14 + 27;
+-                }
++
++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++                ctx_idx_map_p = ctx_idx_maps[0][3];
++                scf_offset = 40 + c_idx_nz;
+             } else {
+-                if (c_idx != 0)
++                if (c_idx_nz != 0)
+                     scf_offset = 27;
++
+                 if (log2_trafo_size == 2) {
+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
+                 } else {
+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
+-                    if (c_idx == 0) {
+-                        if ((x_cg > 0 || y_cg > 0))
++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
++                    if (!c_idx_nz) {
++                        if (i != 0)
+                             scf_offset += 3;
++
+                         if (log2_trafo_size == 3) {
+                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                         } else {
+@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                     }
+                 }
+             }
+-            for (n = n_end; n > 0; n--) {
+-                x_c = scan_x_off[n];
+-                y_c = scan_y_off[n];
+-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
+-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
+-                    nb_significant_coeff_flag++;
++
++            if (n_end > 0) {
++                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
++                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
++                    n_end, ctx_idx_map_p,
++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
++
++                nb_significant_coeff_flag += cnt;
++                if (cnt != 0) {
+                     implicit_non_zero_coeff = 0;
+                 }
+             }
++
+             if (implicit_non_zero_coeff == 0) {
+-                if (s->ps.sps->transform_skip_context_enabled_flag &&
+-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+-                    if (c_idx == 0) {
+-                        scf_offset = 42;
+-                    } else {
+-                        scf_offset = 16 + 27;
+-                    }
++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++                    scf_offset = 42 + c_idx_nz;
+                 } else {
+                     if (i == 0) {
+-                        if (c_idx == 0)
+-                            scf_offset = 0;
+-                        else
+-                            scf_offset = 27;
++                        scf_offset = c_idx_nz ? 27 : 0;
+                     } else {
+                         scf_offset = 2 + scf_offset;
+                     }
+                 }
+-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
++                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
+                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                     nb_significant_coeff_flag++;
+                 }
+@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+             }
+         }
+ 
+-        n_end = nb_significant_coeff_flag;
+-
++        if (nb_significant_coeff_flag != 0) {
++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
++                prev_subset_coded;
++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
++                (gt1_idx_delta << 2);
++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
++                gt1_idx_delta;
++
++            const unsigned int x_cg = scan_x_cg[i];
++            const unsigned int y_cg = scan_y_cg[i];
++            int16_t * const blk_coeffs = coeffs +
++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
++            // This calculation is 'wrong' for log2_trafo_size == 2
++            // but that doesn't matter as in this case x_cg & y_cg
++            // are always 0 so result is correct (0) anyway
++            const uint8_t * const blk_scale = scale_matrix +
++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
++
++            // * The following code block doesn't deal with these flags:
++            //   (nor did the one it replaces)
++            //
++            // cabac_bypass_alignment_enabled_flag
++            //    This should be easy but I can't find a test case
++            // extended_precision_processing_flag
++            //    This can extend the required precision past 16bits
++            //    so is probably tricky - also no example found yet
++
++#if USE_N_END_1
++            if (nb_significant_coeff_flag == 1) {
++                // There is a small gain to be had from special casing the single
++                // transform coefficient case.  The reduction in complexity
++                // makes up for the code duplication.
++
++                int trans_coeff_level = 1;
++                int coeff_sign_flag;
++                int coded_val = 0;
++
++                // initialize first elem of coeff_abs_level_greater1_flag
++                prev_subset_coded = 0;
++
++                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
++                    trans_coeff_level = 2;
++                    prev_subset_coded = 1;
++                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
++                }
+ 
+-        if (n_end) {
+-            int first_nz_pos_in_cg;
+-            int last_nz_pos_in_cg;
+-            int c_rice_param = 0;
+-            int first_greater1_coeff_idx = -1;
+-            uint8_t coeff_abs_level_greater1_flag[8];
+-            uint16_t coeff_sign_flag;
+-            int sum_abs = 0;
+-            int sign_hidden;
+-            int sb_type;
++                // Probably not worth the overhead of starting by22 for just one value
++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+ 
++                if (coded_val)
++                {
++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
++                    } else {
++                        uint8_t * const stat_coeff =
++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++                        const unsigned int c_rice_param = *stat_coeff >> 2;
++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+ 
+-            // initialize first elem of coeff_bas_level_greater1_flag
+-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++                    }
++                }
+ 
+-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+-                else
+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+-                c_rice_param = lc->stat_coeff[sb_type] / 4;
+-            }
++                {
++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
++                    const unsigned int scale_m = blk_scale[xy_off->scale];
+ 
+-            if (!(i == num_last_subset) && greater1_ctx == 0)
+-                ctx_set++;
+-            greater1_ctx = 1;
+-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
+-
+-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
+-                int inc = (ctx_set << 2) + greater1_ctx;
+-                coeff_abs_level_greater1_flag[m] =
+-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
+-                if (coeff_abs_level_greater1_flag[m]) {
+-                    greater1_ctx = 0;
+-                    if (first_greater1_coeff_idx == -1)
+-                        first_greater1_coeff_idx = m;
+-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
+-                    greater1_ctx++;
++                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
++                        (trans_coeff_level ^ k) - k,  // Apply sign
++                        scale,
++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
++                        shift);
+                 }
+             }
+-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
+-
+-            if (lc->cu.cu_transquant_bypass_flag ||
+-                (lc->cu.pred_mode ==  MODE_INTRA  &&
+-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
+-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
+-                 explicit_rdpcm_flag)
+-                sign_hidden = 0;
+             else
+-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
++#endif
++            {
++                int sign_hidden = may_hide_sign;
++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
++                uint32_t coeff_sign_flags;
++                uint32_t coded_vals = 0;
++                // Sum(abs(level[]))
++                // In fact we only need the bottom bit and in some future
++                // version that may be all we calculate
++                unsigned int sum_abs;
++
++                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
++
++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
++                    sign_hidden = 0;
++
++                // -- Start bypass block
++
++                bypass_start(s);
++
++                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
++
++                if (coded_vals != 0)
++                {
++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
++                    int * level = levels - 1;
++
++                    do {
++                        {
++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
++                            level += z;
++                            coded_vals <<= z;
++                        }
+ 
+-            if (first_greater1_coeff_idx != -1) {
+-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
+-            }
+-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
+-            } else {
+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
+-            }
++                        {
++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
++
++                            sum_abs += last_coeff_abs_level_remaining + 1;
++                            *level = trans_coeff_level;
++
++                            if (stat_coeff != NULL)
++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++                            stat_coeff = NULL;
+ 
+-            for (m = 0; m < n_end; m++) {
+-                n = significant_coeff_flag_idx[m];
+-                GET_COORD(offset, n);
+-                if (m < 8) {
+-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
+-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
+-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+-
+-                        trans_coeff_level += last_coeff_abs_level_remaining;
+-                        if (trans_coeff_level > (3 << c_rice_param))
+-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+-                                lc->stat_coeff[sb_type]++;
+-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+-                                if (lc->stat_coeff[sb_type] > 0)
+-                                    lc->stat_coeff[sb_type]--;
+-                            rice_init = 1;
++                            if (trans_coeff_level > (3 << c_rice_param) &&
++                                (c_rice_param < 4 || rice_adaptation_enabled))
++                                ++c_rice_param;
+                         }
+-                    }
+-                } else {
+-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+-
+-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
+-                    if (trans_coeff_level > (3 << c_rice_param))
+-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
+-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
+-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
+-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
+-                            lc->stat_coeff[sb_type]++;
+-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
+-                            if (lc->stat_coeff[sb_type] > 0)
+-                                lc->stat_coeff[sb_type]--;
+-                        rice_init = 1;
+-                    }
++                    } while (coded_vals != 0);
+                 }
+-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
+-                    sum_abs += trans_coeff_level;
+-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
+-                        trans_coeff_level = -trans_coeff_level;
++
++                // sign_hidden = 0 or 1 so we can combine the tests
++                if ((sign_hidden & sum_abs) != 0) {
++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
+                 }
+-                if (coeff_sign_flag >> 15)
+-                    trans_coeff_level = -trans_coeff_level;
+-                coeff_sign_flag <<= 1;
+-                if(!lc->cu.cu_transquant_bypass_flag) {
+-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+-                        if(y_c || x_c || log2_trafo_size < 4) {
+-                            switch(log2_trafo_size) {
+-                                case 3: pos = (y_c << 3) + x_c; break;
+-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
+-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
+-                                default: pos = (y_c << 2) + x_c; break;
+-                            }
+-                            scale_m = scale_matrix[pos];
+-                        } else {
+-                            scale_m = dc_scale;
+-                        }
++
++                bypass_finish(s);
++
++                // -- Finish bypass block
++
++                // Scale loop
++                {
++                    int m = nb_significant_coeff_flag - 1;
++
++                    // Deal with DC component (if any) first
++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
++                    {
++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++                        blk_coeffs[0] = trans_scale_sat(
++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
++                        --m;
+                     }
+-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
+-                    if(trans_coeff_level < 0) {
+-                        if((~trans_coeff_level) & 0xFffffffffff8000)
+-                            trans_coeff_level = -32768;
+-                    } else {
+-                        if(trans_coeff_level & 0xffffffffffff8000)
+-                            trans_coeff_level = 32767;
++
++#if !USE_N_END_1
++                    // If N_END_1 set then m was at least 1 initially
++                    if (m >= 0)
++#endif
++                    {
++                        do {
++                            const xy_off_t * const xy_off = scan_xy_off +
++                                significant_coeff_flag_idx[m];
++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++
++                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
++                                (levels[m] ^ k) - k,
++                                scale,
++                                blk_scale[xy_off->scale],
++                                shift);
++                        } while (--m >= 0);
+                     }
+                 }
+-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
++
+             }
+         }
+-    }
++    } while ((i = next_subset(s, i, c_idx_nz,
++        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
+ 
+     if (lc->cu.cu_transquant_bypass_flag) {
+         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+         }
+     } else {
+-        if (transform_skip_flag) {
++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
+             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                       log2_trafo_size == 2 &&
+                       lc->cu.pred_mode == MODE_INTRA;
+@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                 for (i = 0; i < 8; i++)
+                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+             }
+-
+             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+ 
+             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+             }
+         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+-            s->hevcdsp.idct_4x4_luma(coeffs);
++           s->hevcdsp.idct_4x4_luma(coeffs);
+         } else {
++#ifdef RPI
++            if (!use_vpu) {
++              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++              if (max_xy == 0) {
++                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
++              } else {
++                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
++                  if (max_xy < 4)
++                      col_limit = FFMIN(4, col_limit);
++                  else if (max_xy < 8)
++                      col_limit = FFMIN(8, col_limit);
++                  else if (max_xy < 12)
++                      col_limit = FFMIN(24, col_limit);
++
++                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
++              }
++            }
++#else
+             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+             if (max_xy == 0)
+                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                     col_limit = FFMIN(24, col_limit);
+                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+             }
++#endif
+         }
+     }
+     if (lc->tu.cross_pf) {
+@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+         }
+     }
++#ifdef RPI
++    if (s->enable_rpi) {
++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
++        cmd->type = RPI_PRED_TRANSFORM_ADD;
++        cmd->size = log2_trafo_size;
++        cmd->buf = coeffs;
++        cmd->dst = dst;
++        cmd->stride = stride;
++        return;
 +    }
 +#endif
-     av_freep(&s->sao);
-     av_freep(&s->deblock);
+     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+ }
  
-@@ -283,12 +289,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
- 
- #ifdef RPI
--    av_assert0(sps);
-     int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-     int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-     int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-     int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-     int job;
-+    av_assert0(sps);
-     s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-     s->ctu_per_y_chan = s->max_ctu_count / 12;
-     s->ctu_per_uv_chan = s->max_ctu_count / 8;
-@@ -309,6 +315,16 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-       }
-     }
- #endif
-+#ifdef RPI_DEBLOCK_VPU
-+    s->enable_rpi_deblock = !sps->sao_enabled;
-+    s->setup_width = (sps->width+15) / 16;
-+    s->setup_height = (sps->height+15) / 16;
-+    gpu_malloc_uncached(sizeof(*s->y_setup_arm) * s->setup_width * s->setup_height, &s->y_setup_ptr); // TODO make this cached
-+    s->y_setup_arm = (void*)s->y_setup_ptr.arm;
-+    s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-+    memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-+    printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-+#endif
- 
-     s->bs_width  = (width  >> 2) + 1;
-     s->bs_height = (height >> 2) + 1;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index ef5bfb1..cf08489 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -57,6 +57,8 @@
-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-   #define RPI_WORKER
- 
-+  #define RPI_DEBLOCK_VPU
-+
- #endif
- 
- #define MAX_DPB_SIZE 16 // A.4.1
-@@ -971,6 +973,15 @@ typedef struct HEVCContext {
-     int kill_worker; // set to 1 to terminate the worker
- #endif
- 
-+#ifdef RPI_DEBLOCK_VPU
-+    int enable_rpi_deblock;
-+    GPU_MEM_PTR_T y_setup_ptr;
-+    uint8_t (*y_setup_arm)[2][2][2][4];
-+    uint8_t (*y_setup_vc)[2][2][2][4];
-+    int setup_width; // Number of 16x16 blocks across the image
-+    int setup_height; // Number of 16x16 blocks down the image
-+#endif
-+
- #endif
- 
-     uint8_t *cabac_state;
 diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 1f04790..06371da 100644
+index 1f33b0c..55a0315 100644
 --- a/libavcodec/hevc_filter.c
 +++ b/libavcodec/hevc_filter.c
-@@ -564,6 +564,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -22,6 +22,12 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++//#define DISABLE_SAO
++//#define DISABLE_DEBLOCK
++//#define DISABLE_STRENGTHS
++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
++//#define DISABLE_DEBLOCK_NONREF
++
+ #include "libavutil/common.h"
+ #include "libavutil/internal.h"
+ 
+@@ -31,6 +37,11 @@
+ 
+ #include "bit_depth_template.c"
+ 
++#ifdef RPI
++#include "rpi_user_vcsm.h"
++#include "rpi_qpu.h"
++#endif
++
+ #define LUMA 0
+ #define CB 1
+ #define CR 2
+@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
+     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
+ 
++#ifdef DISABLE_SAO
++    return;
++#endif
++
+     if (restore) {
+         if (!edges[0]) {
+             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                 s->ps.sps->pcm.loop_filter_disable_flag) ||
+                s->ps.pps->transquant_bypass_enable_flag;
+ 
++#ifdef DISABLE_DEBLOCK_NONREF
++    if (!s->used_for_ref)
++      return; // Don't deblock non-reference frames
++#endif
++#ifdef DISABLE_DEBLOCK
++    return;
++#endif
++    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
++        return;
+     if (x0) {
+         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                           s->frame->linesize[LUMA],
                                                           beta, tc, no_p, no_q);
                  } else
@@ -32062,7 +6325,7 @@ index 1f04790..06371da 100644
 +                    int num16 = (y>>4)*s->setup_width + (x>>4);
 +                    int a = ((y>>3) & 1) << 1;
 +                    int b = (x>>3) & 1;
-+                    setup = s->y_setup_arm[num16];
++                    setup = s->dvq->y_setup_arm[num16];
 +                    setup[0][b][0][a] = beta;
 +                    setup[0][b][0][a + 1] = beta;
 +                    setup[0][b][1][a] = tc[0];
@@ -32072,7 +6335,7 @@ index 1f04790..06371da 100644
                      s->hevcdsp.hevc_v_loop_filter_luma(src,
                                                         s->frame->linesize[LUMA],
                                                         beta, tc, no_p, no_q);
-@@ -596,6 +609,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
                                                           s->frame->linesize[LUMA],
                                                           beta, tc, no_p, no_q);
                  } else
@@ -32082,7 +6345,7 @@ index 1f04790..06371da 100644
 +                    int num16 = (y>>4)*s->setup_width + (x>>4);
 +                    int a = ((x>>3) & 1) << 1;
 +                    int b = (y>>3) & 1;
-+                    setup = s->y_setup_arm[num16];
++                    setup = s->dvq->y_setup_arm[num16];
 +                    setup[1][b][0][a] = beta;
 +                    setup[1][b][0][a + 1] = beta;
 +                    setup[1][b][1][a] = tc[0];
@@ -32092,17 +6355,352 @@ index 1f04790..06371da 100644
                      s->hevcdsp.hevc_h_loop_filter_luma(src,
                                                         s->frame->linesize[LUMA],
                                                         beta, tc, no_p, no_q);
-@@ -876,33 +902,85 @@ static void flush_buffer(AVBufferRef *bref) {
+@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                    s->frame->linesize[chroma],
+                                                                    c_tc, no_p, no_q);
+                         } else
++#ifdef RPI_DEBLOCK_VPU
++                        if (s->enable_rpi_deblock) {
++                            uint8_t (*setup)[2][2][4];
++                            int xc = x>>s->ps.sps->hshift[chroma];
++                            int yc = y>>s->ps.sps->vshift[chroma];
++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
++                            int a = ((yc>>3) & 1) << 1;
++                            int b = (xc>>3) & 1;
++                            setup = s->dvq->uv_setup_arm[num16];
++                            setup[0][b][0][a] = c_tc[0];
++                            setup[0][b][0][a + 1] = c_tc[1];
++                        } else
++#endif
+                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
+                                                                  s->frame->linesize[chroma],
+                                                                  c_tc, no_p, no_q);
++
+                     }
+                 }
+ 
+@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                    s->frame->linesize[chroma],
+                                                                    c_tc, no_p, no_q);
+                         } else
++#ifdef RPI_DEBLOCK_VPU
++                        if (s->enable_rpi_deblock) {
++                            uint8_t (*setup)[2][2][4];
++                            int xc = x>>s->ps.sps->hshift[chroma];
++                            int yc = y>>s->ps.sps->vshift[chroma];
++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
++                            int a = ((xc>>3) & 1) << 1;
++                            int b = (yc>>3) & 1;
++                            setup = s->dvq->uv_setup_arm[num16];
++                            setup[1][b][0][a] = c_tc[0];
++                            setup[1][b][0][a + 1] = c_tc[1];
++                        } else
++#endif
+                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                  s->frame->linesize[chroma],
+                                                                  c_tc, no_p, no_q);
+@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+     }
  }
  
- // Return Physical address for this image
--static int ff_hevc_buf_base(AVBufferRef *bref) {
-+static uint32_t get_vc_address(AVBufferRef *bref) {
-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
--  return p->vc & 0x3fffffff;
-+  return p->vc;
- }
+-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+-                             RefPicList *neigh_refPicList)
+-{
+-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+-        // same L0 and L1
+-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+-                return 1;
+-            else
+-                return 0;
+-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+-                return 1;
+-            else
+-                return 0;
+-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+-                return 1;
+-            else
+-                return 0;
+-        } else {
+-            return 1;
+-        }
+-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+-        Mv A, B;
+-        int ref_A, ref_B;
+-
+-        if (curr->pred_flag & 1) {
+-            A     = curr->mv[0];
+-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+-        } else {
+-            A     = curr->mv[1];
+-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+-        }
+-
+-        if (neigh->pred_flag & 1) {
+-            B     = neigh->mv[0];
+-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+-        } else {
+-            B     = neigh->mv[1];
+-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+-        }
+-
+-        if (ref_A == ref_B) {
+-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
+-                return 1;
+-            else
+-                return 0;
+-        } else
+-            return 1;
+-    }
+-
+-    return 1;
+-}
  
+ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+                                            int log2_trafo_size)
+@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
+     int min_pu_width     = s->ps.sps->min_pu_width;
+     int min_tu_width     = s->ps.sps->min_tb_width;
+-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
+-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
+     int boundary_upper, boundary_left;
+-    int i, j, bs;
++    int i, j;
++    RefPicList *rpl      = s->ref->refPicList;
++    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
++    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
++    int y_pu             = y0 >> log2_min_pu_size;
++    int x_pu             = x0 >> log2_min_pu_size;
++    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
++    int is_intra         = curr->pred_flag == PF_INTRA;
++    int inc              = log2_min_pu_size == 2 ? 2 : 1;
++    uint8_t *bs;
++
++#ifdef DISABLE_STRENGTHS
++    return;
++#endif
+ 
+     boundary_upper = y0 > 0 && !(y0 & 7);
+     if (boundary_upper &&
+@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+         boundary_upper = 0;
+ 
++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
++
+     if (boundary_upper) {
+         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
+                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
+-                              s->ref->refPicList;
+-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
+-        int yq_pu =  y0      >> log2_min_pu_size;
+-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
+-        int yq_tu =  y0      >> log2_min_tu_size;
++                              rpl;
++        MvField *top = curr - min_pu_width;
++
++        if (is_intra) {
++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
++                bs[i >> 2] = 2;
++
++        } else {
++            int y_tu = y0 >> log2_min_tu_size;
++            int x_tu = x0 >> log2_min_tu_size;
++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
++
++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
++                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
++                    curr, top, bs);
+ 
+             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+-                int x_pu = (x0 + i) >> log2_min_pu_size;
+-                int x_tu = (x0 + i) >> log2_min_tu_size;
+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+-                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
+-                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
+-
+-                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+-                    bs = 2;
+-                else if (curr_cbf_luma || top_cbf_luma)
+-                    bs = 1;
+-                else
+-                    bs = boundary_strength(s, curr, top, rpl_top);
+-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
++                int i_pu = i >> log2_min_pu_size;
++                int i_tu = i >> log2_min_tu_size;
++
++                if (top[i_pu].pred_flag == PF_INTRA)
++                    bs[i >> 2] = 2;
++                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
++                    bs[i >> 2] = 1;
+             }
++        }
++    }
++
++    if (!is_intra) {
++        for (j = inc; j < trafo_in_min_pus; j += inc) {
++            MvField *top;
++
++            curr += min_pu_width * inc;
++            top = curr - min_pu_width;
++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
++
++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
++                    curr, top, bs);
++        }
+     }
+ 
+-    // bs for vertical TU boundaries
+     boundary_left = x0 > 0 && !(x0 & 7);
+     if (boundary_left &&
+         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
+@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+         boundary_left = 0;
+ 
++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
++
+     if (boundary_left) {
+         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
+                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
+-                               s->ref->refPicList;
+-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
+-        int xq_pu =  x0      >> log2_min_pu_size;
+-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+-        int xq_tu =  x0      >> log2_min_tu_size;
++                               rpl;
++        MvField *left = curr - 1;
+ 
+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+-                int y_pu      = (y0 + i) >> log2_min_pu_size;
+-                int y_tu      = (y0 + i) >> log2_min_tu_size;
+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+-
+-                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+-                    bs = 2;
+-                else if (curr_cbf_luma || left_cbf_luma)
+-                    bs = 1;
+-                else
+-                    bs = boundary_strength(s, curr, left, rpl_left);
+-                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+-            }
+-    }
++        if (is_intra) {
++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
++                bs[j * s->bs_width >> 2] = 2;
+ 
+-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
+-        RefPicList *rpl = s->ref->refPicList;
+-
+-        // bs for TU internal horizontal PU boundaries
+-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+-
+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+-                int x_pu = (x0 + i) >> log2_min_pu_size;
+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+-
+-                bs = boundary_strength(s, curr, top, rpl);
+-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
++        } else {
++            int y_tu = y0 >> log2_min_tu_size;
++            int x_tu = x0 >> log2_min_tu_size;
++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
++
++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
++                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
++                    curr, left, bs);
++
++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
++                int j_pu = j >> log2_min_pu_size;
++                int j_tu = j >> log2_min_tu_size;
++
++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
++                    bs[j * s->bs_width >> 2] = 2;
++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
++                    bs[j * s->bs_width >> 2] = 1;
+             }
+         }
++    }
+ 
+-        // bs for TU internal vertical PU boundaries
+-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+-            int y_pu = (y0 + j) >> log2_min_pu_size;
++    if (!is_intra) {
++        for (i = inc; i < trafo_in_min_pus; i += inc) {
++            MvField *left;
+ 
+-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
+-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
+-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
++            curr += inc;
++            left = curr - 1;
++            bs += inc << log2_min_pu_size >> 2;
+ 
+-                bs = boundary_strength(s, curr, left, rpl);
+-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+-            }
++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
++                    curr, left, bs);
+         }
+     }
+ }
+@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+ #undef CB
+ #undef CR
+ 
++#if !defined(RPI_FAST_CACHEFLUSH)
++#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
++static void flush_buffer_y(const AVFrame * const frame) {
++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
++    gpu_cache_flush(&p);
++}
++
++static void flush_buffer_u(const AVFrame * const frame) {
++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
++    gpu_cache_flush(&p);
++}
++
++static void flush_buffer_v(const AVFrame * const frame) {
++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
++    gpu_cache_flush(&p);
++}
++#endif
++#endif
++
++
++#ifdef RPI_DEBLOCK_VPU
++#error Not fixed yet
++
 +// ff_hevc_flush_buffer_lines
 +// flushes and invalidates all pixel rows in [start,end-1]
 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
@@ -32114,156 +6712,1134 @@ index 1f04790..06371da 100644
 +        int curr_uv = curr_y >> s->ps.sps->vshift[1];
 +        int n_uv = n >> s->ps.sps->vshift[1];
 +        int sz,base;
-+        GPU_MEM_PTR_T *p;
++        GPU_MEM_PTR_T p;
 +        if (curr_uv < 0) curr_uv = 0;
 +        if (n_uv<=curr_uv) { return; }
 +        sz = s->frame->linesize[1] * (n_uv-curr_uv);
 +        base = s->frame->linesize[1] * curr_uv;
 +        if (flush_chroma) {
-+          p = av_buffer_pool_opaque(s->frame->buf[1]);
-+          iocache.s[0].handle = p->vcsm_handle;
++          p = get_gpu_mem_ptr_u(s->frame);
++          iocache.s[0].handle = p.vcsm_handle;
 +          iocache.s[0].cmd = 3; // clean+invalidate
-+          iocache.s[0].addr = (int)p->arm + base;
++          iocache.s[0].addr = (int)p.arm + base;
 +          iocache.s[0].size  = sz;
-+          p = av_buffer_pool_opaque(s->frame->buf[2]);
-+          iocache.s[1].handle = p->vcsm_handle;
++          p = get_gpu_mem_ptr_v(s->frame);
++          iocache.s[1].handle = p.vcsm_handle;
 +          iocache.s[1].cmd = 3; // clean+invalidate
-+          iocache.s[1].addr = (int)p->arm + base;
++          iocache.s[1].addr = (int)p.arm + base;
 +          iocache.s[1].size  = sz;
 +        }
 +        if (flush_luma) {
-+          p = av_buffer_pool_opaque(s->frame->buf[0]);
++          p = get_gpu_mem_ptr_y(s->frame);
 +          sz = s->frame->linesize[0] * (n-curr_y);
 +          base = s->frame->linesize[0] * curr_y;
-+          iocache.s[2].handle = p->vcsm_handle;
++          iocache.s[2].handle = p.vcsm_handle;
 +          iocache.s[2].cmd = 3; // clean+invalidate
-+          iocache.s[2].addr = (int)p->arm + base;
++          iocache.s[2].addr = (int)p.arm + base;
 +          iocache.s[2].size  = sz;
 +        }
 +        vcsm_clean_invalid( &iocache );
 +#else
 +        if (flush_chroma) {
-+          flush_buffer(s->frame->buf[1]);
-+          flush_buffer(s->frame->buf[2]);
++          flush_buffer_u(s->frame);
++          flush_buffer_v(s->frame);
 +        }
 +        if (flush_luma) {
-+          flush_buffer(s->frame->buf[0]);
++          flush_buffer_y(s->frame);
 +        }
 +#endif
 +}
++#endif
 +
-+
- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- {
-     if (s->enable_rpi && s->used_for_ref) {
++#ifdef RPI_INTER_QPU
++void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
++{
++    if (s->enable_rpi && s->used_for_ref) {
 +      // TODO make this use ff_hevc_flush_buffer_lines
- #ifdef RPI_FAST_CACHEFLUSH
-         struct vcsm_user_clean_invalid_s iocache = {};
-         int curr_y = ((int *)f->progress->data)[0];
-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
-         int n_uv = n >> s->ps.sps->vshift[1];
-         int sz,base;
-+        GPU_MEM_PTR_T *p;
-         if (curr_uv < 0) curr_uv = 0;
-         if (n_uv<=curr_uv) { return; }
-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-         base = s->frame->linesize[1] * curr_uv;
--        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-+        p = av_buffer_pool_opaque(s->frame->buf[1]);
-         iocache.s[0].handle = p->vcsm_handle;
-         iocache.s[0].cmd = 3; // clean+invalidate
--        iocache.s[0].addr = p->arm + base;
-+        iocache.s[0].addr = (int)p->arm + base;
-         iocache.s[0].size  = sz;
-         p = av_buffer_pool_opaque(s->frame->buf[2]);
-         iocache.s[1].handle = p->vcsm_handle;
-         iocache.s[1].cmd = 3; // clean+invalidate
--        iocache.s[1].addr = p->arm + base;
-+        iocache.s[1].addr = (int)p->arm + base;
-         iocache.s[1].size  = sz;
- 
- #ifdef RPI_LUMA_QPU
-@@ -911,7 +989,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-         base = s->frame->linesize[0] * curr_y;
-         iocache.s[2].handle = p->vcsm_handle;
-         iocache.s[2].cmd = 3; // clean+invalidate
--        iocache.s[2].addr = p->arm + base;
-+        iocache.s[2].addr = (int)p->arm + base;
-         iocache.s[2].size  = sz;
- #endif
-         vcsm_clean_invalid( &iocache );
-@@ -930,11 +1008,40 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- }
- #endif
- 
++#ifdef RPI_FAST_CACHEFLUSH
++        struct vcsm_user_clean_invalid_s iocache = {};
++        int curr_y = ((int *)f->progress->data)[0];
++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
++        int n_uv = n >> s->ps.sps->vshift[1];
++        int sz,base;
++        GPU_MEM_PTR_T p;
++        if (curr_uv < 0) curr_uv = 0;
++        if (n_uv<=curr_uv) { return; }
++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
++        base = s->frame->linesize[1] * curr_uv;
++        p = get_gpu_mem_ptr_u(s->frame);
++        iocache.s[0].handle = p.vcsm_handle;
++        iocache.s[0].cmd = 3; // clean+invalidate
++        iocache.s[0].addr = (int)p.arm + base;
++        iocache.s[0].size  = sz;
++        p = get_gpu_mem_ptr_v(s->frame);
++        iocache.s[1].handle = p.vcsm_handle;
++        iocache.s[1].cmd = 3; // clean+invalidate
++        iocache.s[1].addr = (int)p.arm + base;
++        iocache.s[1].size  = sz;
++
++#ifdef RPI_LUMA_QPU
++        p = get_gpu_mem_ptr_y(s->frame);
++        sz = s->frame->linesize[0] * (n-curr_y);
++        base = s->frame->linesize[0] * curr_y;
++        iocache.s[2].handle = p.vcsm_handle;
++        iocache.s[2].cmd = 3; // clean+invalidate
++        iocache.s[2].addr = (int)p.arm + base;
++        iocache.s[2].size  = sz;
++#endif
++        vcsm_clean_invalid( &iocache );
++#else
++        flush_buffer_u(s->frame);
++        flush_buffer_v(s->frame);
++#ifdef RPI_LUMA_QPU
++        flush_buffer_y(s->frame);
++#endif
++
++#endif
++        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
++        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
++        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
++    }
++}
++#endif
++
 +#ifdef RPI_DEBLOCK_VPU
++#error XXX
 +/* rpi_deblock deblocks an entire row of ctbs using the VPU */
 +static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
 +{
 +  // Flush image, 4 lines above to bottom of ctb stripe
-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
 +  // TODO flush buffer of beta/tc setup when it becomes cached
++
++  // Prepare three commands at once to avoid calling overhead
++  s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
++  s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
++  s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
++  s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
++  s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
++  s->dvq->vpu_cmds_arm[0][5] = 2;
++
++  s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
++  s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
++  s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
++  s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
++  s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
++  s->dvq->vpu_cmds_arm[1][5] = 3;
++
++  s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
++  s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
++  s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
++  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
++  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
++  s->dvq->vpu_cmds_arm[2][5] = 4;
 +  // Call VPU
-+  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
-+  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
-+                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
-+                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
++  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
++
++  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
++  s->dvq = s->dvq_ents + s->dvq_n;
++
++  if (s->dvq->cmd_id != -1) {
++      vpu_wait(s->dvq->cmd_id);
++      s->dvq->cmd_id = -1;
++  }
 +}
 +
-+static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
-+{
-+   int y2;
-+   for(y2=y;y2<y+ctb_size;y2+=16) {
-+      rpi_deblock(s,y2,16);
-+   }
-+}
 +#endif
 +
  void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
  {
      int x_end = x >= s->ps.sps->width  - ctb_size;
++#ifdef RPI_DEBLOCK_VPU
++    int done_deblock = 0;
++#endif
      if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
          deblocking_filter_CTB(s, x, y);
 +#ifdef RPI_DEBLOCK_VPU
 +    if (s->enable_rpi_deblock && x_end)
 +    {
-+      rpi_deblock(s, y, ctb_size);
++      int y_at_end = y >= s->ps.sps->height - ctb_size;
++      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
++      int y_start = y&~63;
++      if (y_at_end) height = s->ps.sps->height - y_start;
++      if ((((y+ctb_size)&63)==0) || y_at_end) {
++        done_deblock = 1;
++        rpi_deblock(s, y_start, height);
++      }
 +    }
 +#endif
      if (s->ps.sps->sao_enabled) {
          int y_end = y >= s->ps.sps->height - ctb_size;
          if (y && x)
-@@ -965,6 +1072,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-         //if (((y + ctb_size)&63)==0)
- #ifdef RPI_INTER_QPU
-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
- #endif
+@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+             sao_filter_CTB(s, x - ctb_size, y);
+         if (y && x_end) {
+             sao_filter_CTB(s, x, y - ctb_size);
+-            if (s->threads_type & FF_THREAD_FRAME )
++            if (s->threads_type & FF_THREAD_FRAME ) {
++#ifdef RPI_INTER_QPU
++                ff_hevc_flush_buffer(s,&s->ref->tf, y);
++#endif
+                 ff_thread_report_progress(&s->ref->tf, y, 0);
++            }
+         }
+         if (x_end && y_end) {
+             sao_filter_CTB(s, x , y);
+-            if (s->threads_type & FF_THREAD_FRAME )
++            if (s->threads_type & FF_THREAD_FRAME ) {
++#ifdef RPI_INTER_QPU
++                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
++#endif
+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
++            }
++        }
++    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
++        //int newh = y + ctb_size - 4;
++        //int currh = s->ref->tf.progress->data[0];
++        //if (((y + ctb_size)&63)==0)
++#ifdef RPI_DEBLOCK_VPU
++        if (s->enable_rpi_deblock) {
++          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
++          if (done_deblock) {
++            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
++          }
++        } else {
++#ifdef RPI_INTER_QPU
++          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++#endif
++          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+         }
+-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
++#else
++#ifdef RPI_INTER_QPU
++        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
++        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
++#endif
          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
++#endif
++    }
+ }
+ 
+ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
+index 83f2ec2..6882a8d 100644
+--- a/libavcodec/hevc_ps.c
++++ b/libavcodec/hevc_ps.c
+@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
+     sps->amp_enabled_flag = get_bits1(gb);
+     sps->sao_enabled      = get_bits1(gb);
+ 
++    av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
++
+     sps->pcm_enabled_flag = get_bits1(gb);
+     if (sps->pcm_enabled_flag) {
+         sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
+diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
+index 9d773d9..a6534a9 100644
+--- a/libavcodec/hevcdsp.c
++++ b/libavcodec/hevcdsp.c
+@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+ #include "hevcdsp_template.c"
+ #undef BIT_DEPTH
+ 
++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
++                                               MvField *curr, MvField *neigh, uint8_t *bs)
++{
++    for (; pus > 0; pus--) {
++        int strength, out;
++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
++
++#if 1 // This more directly matches the original implementation
++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
++            // same L0 and L1
++            if (curr_refL0 == neigh_refL0 &&
++                curr_refL0 == curr_refL1 &&
++                neigh_refL0 == neigh_refL1) {
++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL0 == curr_refL0 &&
++                       neigh_refL1 == curr_refL1) {
++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL1 == curr_refL0 &&
++                       neigh_refL0 == curr_refL1) {
++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else {
++                strength = 1;
++            }
++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++            Mv curr_mv0, neigh_mv0;
++
++            if (curr->pred_flag & 1) {
++                curr_mv0   = curr->mv[0];
++            } else {
++                curr_mv0   = curr->mv[1];
++                curr_refL0 = curr_refL1;
++            }
++
++            if (neigh->pred_flag & 1) {
++                neigh_mv0   = neigh->mv[0];
++            } else {
++                neigh_mv0   = neigh->mv[1];
++                neigh_refL0 = neigh_refL1;
++            }
++
++            if (curr_refL0 == neigh_refL0) {
++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else
++                strength = 1;
++        } else
++            strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++        Mv curr_mv[2];
++        Mv neigh_mv[2];
++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
++
++        if (!(curr->pred_flag & 2)) {
++            curr_mv[1] = curr_mv[0];
++            curr_refL1 = curr_refL0;
++        }
++        if (!(neigh->pred_flag & 2)) {
++            neigh_mv[1] = neigh_mv[0];
++            neigh_refL1 = neigh_refL0;
++        }
++        if (!(curr->pred_flag & 1)) {
++            curr_mv[0] = curr_mv[1];
++            curr_refL0 = curr_refL1;
++        }
++        if (!(neigh->pred_flag & 1)) {
++            neigh_mv[0] = neigh_mv[1];
++            neigh_refL0 = neigh_refL1;
++        }
++
++        strength = 1;
++
++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
++                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
++
++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
++                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
++
++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++        curr += in_inc / sizeof (MvField);
++        neigh += in_inc / sizeof (MvField);
++
++        for (out = dup; out > 0; out--)
++        {
++            *bs = strength;
++            bs += out_inc;
++        }
++    }
++}
++
+ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+ {
+ #undef FUNC
+@@ -257,6 +371,8 @@ int i = 0;
+         break;
      }
+ 
++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++
+     if (ARCH_X86)
+         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+     if (ARCH_ARM)
+diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
+index 9f1f6dd..e221e54 100644
+--- a/libavcodec/hevcdsp.h
++++ b/libavcodec/hevcdsp.h
+@@ -42,6 +42,17 @@ typedef struct SAOParams {
+     uint8_t type_idx[3];    ///< sao_type_idx
+ } SAOParams;
+ 
++typedef struct Mv {  // one motion vector, held as a pair of signed 16-bit components
++    int16_t x;  ///< horizontal component of motion vector
++    int16_t y;  ///< vertical component of motion vector
++} Mv;
++
++typedef struct MvField {  // per-block motion data; passed as curr/neigh to hevc_deblocking_boundary_strengths()
++    DECLARE_ALIGNED(4, Mv, mv)[2];  ///< two motion vectors; mv[0] is paired with ref_idx[0], mv[1] with ref_idx[1]
++    int8_t ref_idx[2];  ///< indices into the reference picture lists: [0] indexes rpl0, [1] indexes rpl1
++    int8_t pred_flag;  ///< bit 0 (value 1): entry [0] in use; bit 1 (value 2): entry [1] in use; tested against PF_BI (presumably both bits set - confirm)
++} MvField;
++
+ typedef struct HEVCDSPContext {
+     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                     struct GetBitContext *gb, int pcm_bit_depth);
+@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
+     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+                                         int32_t *tc, uint8_t *no_p,
+                                         uint8_t *no_q);
++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
++                                               MvField *curr, MvField *neigh, uint8_t *bs);
+ } HEVCDSPContext;
+ 
+ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+index 6ae87cc..28d2653 100644
+--- a/libavcodec/hevcpred_template.c
++++ b/libavcodec/hevcpred_template.c
+@@ -20,6 +20,8 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++//#define DISABLE_INTRA
++
+ #include "libavutil/pixdesc.h"
+ 
+ #include "bit_depth_template.c"
+@@ -69,8 +71,11 @@ do {                                  \
+                 AV_WN4P(&ptr[i], a);                                           \
+             else                                                               \
+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+-
++#ifdef RPI_WORKER
++    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
++#else
+     HEVCLocalContext *lc = s->HEVClc;
++#endif
+     int i;
+     int hshift = s->ps.sps->hshift[c_idx];
+     int vshift = s->ps.sps->vshift[c_idx];
+@@ -114,6 +119,10 @@ do {                                  \
+     int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                            (x0 + size_in_luma_h)) >> hshift;
+ 
++#ifdef DISABLE_INTRA
++    return;
++#endif
++
+     if (s->ps.pps->constrained_intra_pred_flag == 1) {
+         int size_in_luma_pu_v = PU(size_in_luma_v);
+         int size_in_luma_pu_h = PU(size_in_luma_h);
+diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+index 099a8c5..bdff2d2 100644
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+  * MMAL Video Decoder
+  */
+ 
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ 
+ #include "avcodec.h"
+ #include "internal.h"
+diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
+index 3adf28d..2f9195f 100644
+--- a/libavcodec/mpeg4videodec.c
++++ b/libavcodec/mpeg4videodec.c
+@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+ 
+         if (ctx->divx_version >= 0)
+             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
++
++        if (ctx->num_sprite_warping_points > 1)
++            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
+     }
+ 
+     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
+@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
+                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
+                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
+ 
++    avctx->workaround_bugs = s->workaround_bugs;
+     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+         s->codec_id == AV_CODEC_ID_MPEG4 &&
+         avctx->idct_algo == FF_IDCT_AUTO) {
 diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-index 4f13622..b3f155f 100644
---- a/libavcodec/rpi_hevc_transform.h
+new file mode 100644
+index 0000000..4309f1c
+--- /dev/null
 +++ b/libavcodec/rpi_hevc_transform.h
-@@ -3,7 +3,13 @@ unsigned char rpi_hevc_transform [] = {
- 106,
- 0,
- 144,
--35,
-+38,
+@@ -0,0 +1,3070 @@
++unsigned char rpi_hevc_transform [] = {
++21,
++106,
++0,
++144,
++47,
 +1,
 +37,
 +106,
 +0,
 +144,
++66,
++1,
++53,
++106,
++0,
++144,
++192,
++4,
++69,
++106,
++0,
++144,
++192,
++4,
++85,
++106,
++0,
++144,
++220,
++5,
++169,
++3,
++62,
++64,
++79,
++64,
++3,
++232,
++32,
++0,
++0,
++0,
++12,
++248,
++0,
++136,
++0,
++0,
++192,
++248,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++12,
++248,
++0,
++168,
++0,
++0,
++192,
++248,
++0,
++0,
++0,
++96,
++3,
++232,
++32,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++8,
++232,
++0,
++4,
++0,
++0,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++4,
++232,
++64,
++0,
++0,
++0,
++5,
++232,
++0,
++8,
++0,
++0,
++128,
++69,
++113,
++66,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++128,
++69,
++113,
++70,
++128,
++144,
++40,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++16,
++0,
++76,
++254,
++48,
++192,
++9,
++4,
++32,
++8,
++0,
++0,
++4,
++254,
++0,
++144,
++128,
++2,
++0,
++8,
++2,
++0,
++128,
++144,
++23,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++20,
++0,
++76,
++254,
++48,
++192,
++4,
++4,
++32,
++8,
++0,
++0,
++140,
++248,
++44,
++0,
++0,
++0,
++32,
++48,
++4,
++0,
++128,
++69,
++113,
++66,
++242,
++140,
++211,
++192,
++34,
++31,
++41,
++3,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++96,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++224,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++225,
++64,
++242,
++64,
++3,
++232,
++128,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
 +57,
- 1,
- 169,
- 3,
-@@ -627,4 +633,1798 @@ unsigned char rpi_hevc_transform [] = {
- 30,
- 90,
- 0,
++239,
++224,
++247,
++255,
++255,
++72,
++192,
++95,
++207,
++88,
++122,
++88,
++124,
++137,
++64,
++26,
++64,
++4,
++232,
++64,
++0,
++0,
++0,
++149,
++96,
++161,
++64,
++152,
++64,
++128,
++144,
++35,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++27,
++0,
++4,
++232,
++0,
++8,
++0,
++0,
++69,
++96,
++145,
++64,
++168,
++64,
++128,
++144,
++19,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++11,
++0,
++74,
++232,
++0,
++8,
++0,
++0,
++242,
++140,
++221,
++192,
++57,
++239,
++32,
++8,
++0,
++0,
++41,
++3,
++239,
++3,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++248,
++4,
++0,
++12,
++248,
++0,
++132,
++64,
++0,
++192,
++248,
++4,
++0,
++0,
++96,
++255,
++159,
++154,
++255,
++0,
++232,
++0,
++4,
++0,
++0,
++255,
++159,
++165,
++255,
++4,
++255,
++48,
++204,
++16,
++3,
++224,
++251,
++62,
++0,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++128,
++64,
++6,
++232,
++64,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++32,
++247,
++240,
++207,
++16,
++3,
++32,
++247,
++176,
++207,
++17,
++19,
++32,
++247,
++112,
++207,
++18,
++35,
++32,
++247,
++48,
++207,
++19,
++51,
++32,
++247,
++240,
++206,
++20,
++67,
++32,
++247,
++176,
++206,
++21,
++83,
++32,
++247,
++112,
++206,
++22,
++99,
++32,
++247,
++48,
++206,
++23,
++115,
++32,
++247,
++240,
++205,
++24,
++131,
++32,
++247,
++176,
++205,
++25,
++147,
++32,
++247,
++112,
++205,
++26,
++163,
++32,
++247,
++48,
++205,
++27,
++179,
++32,
++247,
++240,
++204,
++28,
++195,
++32,
++247,
++176,
++204,
++29,
++211,
++32,
++247,
++112,
++204,
++30,
++227,
++32,
++247,
++48,
++204,
++31,
++243,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++0,
++237,
++32,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++111,
++3,
++4,
++254,
++0,
++128,
++0,
++4,
++0,
++248,
++0,
++0,
++2,
++232,
++32,
++0,
++0,
++0,
++140,
++248,
++32,
++0,
++0,
++0,
++224,
++35,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++193,
++232,
++0,
++1,
++0,
++0,
++1,
++106,
++116,
++30,
++90,
++0,
 +169,
 +3,
 +73,
@@ -34058,935 +9634,6 @@ index 4f13622..b3f155f 100644
 +128,
 +90,
 +0,
- };
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-index fd159bc..b055208 100644
---- a/libavcodec/rpi_hevc_transform.s
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -83,6 +83,8 @@
- hevc_trans_16x16:
-   cmp r5,1
-   beq memclear16
-+  cmp r5,2
-+  beq hevc_deblock_16x16
-   push r6-r15, lr # TODO cut down number of used registers
-   mov r14,r3 # coeffs32
-   mov r15,r4 # num32
-@@ -282,3 +284,427 @@ loop:
-   cmp r1,0
-   bgt loop
-   b lr
-+
-+
-+################################################################################
-+# HEVC VPU Deblock
-+#
-+# Vertical edges before horizontal
-+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
-+#
-+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
-+# The VPU code works in units of 16x16 blocks.
-+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
-+# One final horizontal filter is required at the end.
-+# PCM is not allowed in this code.
-+#
-+#
-+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
-+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
-+
-+.set P0,63
-+.set P1,62
-+.set P2,61
-+.set P3,60
-+.set Q0,59
-+.set Q1,58
-+.set Q2,57
-+.set Q3,56
-+
-+.set dp,32
-+.set dq,33
-+.set d,34
-+.set decision,35
-+.set beta,36
-+.set beta2,37
-+.set beta3,38
-+.set ptest,39
-+.set qtest,40
-+.set pqtest,41
-+.set thresh,42
-+.set deltatest, 44
-+.set deltap1, 45
-+.set tc25, 46
-+.set setup,47
-+.set tc,48
-+.set tc25,49
-+.set tc2, 50
-+.set do_filter, 51
-+.set delta, 52
-+.set tc10, 53
-+.set delta0, 54
-+.set delta1, 55
-+.set zeros, 0
-+.set setup_input, 1
-+.set deltaq1, 2
-+
-+
-+
-+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
-+# Row has num16 16x16 blocks across
-+# Beta goes from 0 to 64
-+# tc goes from 0 to 24
-+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
-+#   has 8 bytes per edge
-+#   has 16 bytes per direction
-+#   has 32 bytes per 16x16 block
-+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
-+hevc_deblock_16x16:
-+  push r6-r15, lr
-+  mov r9,r4
-+  mov r4,r3
-+  mov r13,r2
-+  mov r2,r0
-+  mov r10,r0
-+  subscale4 r0,r1
-+  mov r8,63
-+  mov r6,-3
-+  vmov H(zeros,0),0
-+# r7 is number of blocks still to load
-+# r0 is location of current block - 4 * stride
-+# r1 is stride
-+# r2 is location of current block
-+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
-+# r4 is setup
-+# r5 is for temporary calculations
-+# r8 holds 63
-+# r6 holds -3
-+# r9 holds the number of 16 high rows to process
-+# r10 holds the original img base
-+# r11 returns 0 if no filtering was done on the edge
-+# r12 saves a copy of this
-+# r13 is copy of width
-+
-+process_row:
-+  # First iteration does not do horizontal filtering on previous
-+  mov r7, r13
-+  mov r3,0
-+  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
-+  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
-+  vstb H(zeros,0),(r4)
-+  bl vert_filter
-+  add r3,8
-+  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
-+  bl vert_filter
-+  sub r3,8
-+  b start_deblock_loop
-+deblock_loop:
-+  # Middle iterations do vertical on current block and horizontal on preceding
-+  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
-+  vldb H(setup_input,0), (r4)
-+  vstb H(zeros,0),(r4)
-+  bl vert_filter
-+  add r3,8
-+  vadd H(setup_input,0),H(setup_input,8),0
-+  bl vert_filter
-+  sub r3,8
-+  vldb H(setup_input,0), -16(r4)
-+  vstb H(zeros,0),-16(r4)
-+  bl horz_filter
-+  mov r12,r11
-+  add r3,8*64
-+  vadd H(setup_input,0),H(setup_input,8),0
-+  bl horz_filter
-+  sub r3,8*64
-+  addcmpbeq r12,0,0,skip_save_top
-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-+skip_save_top:
-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-+start_deblock_loop:
-+  # move onto next 16x16 (could do this with circular buffer support instead)
-+  add r3,16
-+  and r3,r8
-+  add r4,32
-+  # Perform loop counter operations (may work with an addcmpbgt as well?)
-+  add r0,16
-+  add r2,16
-+  sub r7,1
-+  cmp r7,0 # Are there still more blocks to load
-+  bgt deblock_loop
-+
-+  # Final iteration needs to just do horizontal filtering
-+  vldb H(setup_input,0), -16(r4)
-+  vstb H(zeros,0),-16(r4)
-+  bl horz_filter
-+  mov r12,r11
-+  add r3,8*64
-+  vadd H(setup_input,0),H(setup_input,8),0
-+  bl horz_filter
-+  sub r3,64*8
-+  addcmpbeq r12,0,0,skip_save_top2
-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-+skip_save_top2:
-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-+
-+# Now look to see if we should do another row
-+  sub r9,1
-+  cmp r9,0
-+  bgt start_again
-+  pop r6-r15, pc
-+start_again:
-+  # Need to sort out r0,r2 to point to next row down
-+  addscale16 r10,r1
-+  mov r2,r10
-+  subscale4 r0,r2,r1
-+  b process_row
-+
-+
-+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
-+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
-+
-+vert_filter:
-+  push lr
-+
-+  vmov HX(P3,0), V(16,12)+r3
-+  vmov HX(P2,0), V(16,13)+r3
-+  vmov HX(P1,0), V(16,14)+r3
-+  vmov HX(P0,0), V(16,15)+r3
-+  vmov HX(Q0,0), V(16,16)+r3
-+  vmov HX(Q1,0), V(16,17)+r3
-+  vmov HX(Q2,0), V(16,18)+r3
-+  vmov HX(Q3,0), V(16,19)+r3
-+
-+  bl do_luma_filter
-+
-+  vadds V(16,13)+r3, HX(P2,0), 0
-+  vadds V(16,14)+r3, HX(P1,0), 0
-+  vadds V(16,15)+r3, HX(P0,0), 0
-+  # P3 and Q3 never change so don't bother saving back
-+  vadds V(16,16)+r3, HX(Q0,0), 0
-+  vadds V(16,17)+r3, HX(Q1,0), 0
-+  vadds V(16,18)+r3, HX(Q2,0), 0
-+
-+  pop pc
-+
-+# Filter edge at H(16,0)+r3
-+horz_filter:
-+  push lr
-+
-+  vmov HX(P3,0), H(12,0)+r3
-+  vmov HX(P2,0), H(13,0)+r3
-+  vmov HX(P1,0), H(14,0)+r3
-+  vmov HX(P0,0), H(15,0)+r3
-+  vmov HX(Q0,0), H(16,0)+r3
-+  vmov HX(Q1,0), H(17,0)+r3
-+  vmov HX(Q2,0), H(18,0)+r3
-+  vmov HX(Q3,0), H(19,0)+r3
-+
-+  bl do_luma_filter
-+
-+  vadds H(13,0)+r3, HX(P2,0), 0
-+  vadds H(14,0)+r3, HX(P1,0), 0
-+  vadds H(15,0)+r3, HX(P0,0), 0
-+  # P3 and Q3 never change so don't bother saving back
-+  vadds H(16,0)+r3, HX(Q0,0), 0
-+  vadds H(17,0)+r3, HX(Q1,0), 0
-+  vadds H(18,0)+r3, HX(Q2,0), 0
-+
-+  pop pc
-+
-+# r4 points to array of beta/tc for each 4 length edge
-+do_luma_filter:
-+  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
-+  valtl HX(beta,0),H(setup,0),H(setup,0)
-+  valtu HX(tc,0),H(setup,0),H(setup,0)
-+  vmul HX(tc25,0), HX(tc,0), 5
-+  vadd HX(tc25,0),HX(tc25,0), 1
-+  vasr HX(tc25,0), HX(tc25,0), 1
-+
-+  # Compute decision
-+  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
-+  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
-+  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
-+  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
-+
-+  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
-+  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
-+  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
-+  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
-+
-+  vadd HX(d,0), HX(dp,0), HX(dq,0)
-+  vasr HX(beta2,0),HX(beta,0),2
-+  vasr HX(beta3,0),HX(beta,0),3
-+
-+  # Compute flags that are negative if all conditions pass
-+  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
-+  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
-+  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
-+
-+  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
-+  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
-+  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
-+  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
-+  vmov HX(decision,0), 1 IFNN
-+  vadd H(decision,0),H(decision,3),0 IFN
-+  vadd H(decision,16),H(decision,19),0 IFN
-+  vmov -,HX(decision,0) SETF   # N marks strong filter
-+  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
-+
-+  vadd HX(do_filter,0), HX(d,3), HX(d,0)
-+  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
-+  vmov HX(decision,0),0 IFNN # Z marks no filter
-+
-+  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
-+  # First extract out even terms
-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
-+  # Now expand back
-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
-+
-+  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
-+
-+  # Do a quick check to see if there is anything to do
-+  mov r11, 0 # Signal no filtering
-+  vmov -,1 IFNZ SUMS r5
-+  cmp r5,0
-+  beq filtering_done
-+  mov r11, 1 # Signal some filtering
-+  # And whether there is any strong filtering
-+  vmov -,1 IFN SUMS r5
-+  cmp r5,0
-+  beq normal_filtering
-+
-+  ##############################################################################
-+  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
-+  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tx2 is tc/2, while here it is tc*2
-+
-+  # Take a copy of the original pixels for use in decision calculation
-+  vmov HX(P0,32),HX(P0,0)
-+  vmov HX(Q0,32),HX(Q0,0)
-+  vmov HX(P1,32),HX(P1,0)
-+  vmov HX(Q1,32),HX(Q1,0)
-+  vmov HX(P2,32),HX(P2,0)
-+  vmov HX(Q2,32),HX(Q2,0)
-+
-+  vadd -,HX(P2,32),4 CLRA SACC
-+  vshl -,HX(P1,32),1 SACC
-+  vshl -,HX(P0,32),1 SACC
-+  vshl -,HX(Q0,32),1 SACC
-+  vshl HX(delta,0),HX(Q1,32),0 SACC
-+  vasr HX(delta,0),HX(delta,0), 3
-+  vsub HX(delta,0),HX(delta,0),HX(P0,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
-+
-+  vadd -,HX(P2,32),2 CLRA SACC
-+  vadd -,HX(P1,32),HX(P0,32) SACC
-+  vshl HX(delta,0),HX(Q0,32),0 SACC
-+  vasr HX(delta,0),HX(delta,0), 2
-+  vsub HX(delta,0),HX(delta,0),HX(P1,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
-+
-+  vadd -,HX(Q0,32),4 CLRA SACC
-+  vadd -,HX(P1,32),HX(P0,32) SACC
-+  vmul -,HX(P2,32),3 SACC
-+  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
-+  vasr HX(delta,0),HX(delta,0), 3
-+  vsub HX(delta,0),HX(delta,0),HX(P2,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
-+  #vmov HX(P2,0),3 IFN
-+
-+  # Now reverse all P/Qs
-+
-+  vadd -,HX(Q2,32),4 CLRA SACC
-+  vshl -,HX(Q1,32),1 SACC
-+  vshl -,HX(Q0,32),1 SACC
-+  vshl -,HX(P0,32),1 SACC
-+  vshl HX(delta,0),HX(P1,32),0 SACC
-+  vasr HX(delta,0),HX(delta,0), 3
-+  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
-+
-+  vadd -,HX(Q2,32),2 CLRA SACC
-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
-+  vshl HX(delta,0),HX(P0,32),0 SACC
-+  vasr HX(delta,0),HX(delta,0), 2
-+  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
-+
-+  vadd -,HX(P0,32),4 CLRA SACC
-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
-+  vmul -,HX(Q2,32),3 SACC
-+  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
-+  vasr HX(delta,0),HX(delta,0), 3
-+  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-+  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
-+
-+  ##############################################################################
-+  # Normal filtering
-+normal_filtering:
-+  # Invert the decision flags
-+  # make instruction more complicated as assembler has error and loses SETF
-+  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
-+  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
-+
-+  vmov -,1 IFN SUMS r5
-+  cmp r5,0
-+  beq filtering_done
-+
-+  vasr HX(tc2,0), HX(tc,0), 1
-+  vmul HX(tc10,0), HX(tc,0), 10
-+
-+  vasr HX(thresh,0), HX(beta,0), 1
-+  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
-+  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
-+
-+  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
-+  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
-+  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
-+  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
-+  # Expand ptest and qtest together
-+  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
-+  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
-+  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
-+  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
-+  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
-+
-+  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
-+  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
-+  vmov -,8 CLRA SACC
-+  vmul -,HX(delta0,0), 9 SACC
-+  vmul HX(delta0,0),HX(delta1,0), r6 SACC
-+  vasr HX(delta0,0), HX(delta0,0), 4
-+  vdist HX(deltatest,0), HX(delta0,0), 0
-+  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
-+  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
-+
-+  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
-+
-+  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
-+  vadd HX(deltap1,0), HX(deltap1,0), 1
-+  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
-+  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
-+  vasr HX(deltap1,0), HX(deltap1,0), 1
-+  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
-+
-+  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
-+  vadd HX(deltaq1,0), HX(deltaq1,0), 1
-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
-+  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
-+  vrsub -, HX(delta0,0), 0 SACC
-+  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1
-+  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
-+
-+  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
-+  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
-+
-+  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
-+  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
-+
-+  vmov -,HX(deltatest,0) SETF
-+  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
-+  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
-+
-+  #vmov HX(P2,0),1 IFN
-+
-+filtering_done:
-+  b lr
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 0121fca..05b2169 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -147,7 +147,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-   vcsm_init();
-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
--  memset(ptr, 0, sizeof *ptr);
-+  memset((void*)ptr, 0, sizeof *ptr);
-   vc = gpu_mem_ptr.vc;
- 
-   ptr->mb = mb;
-@@ -254,7 +254,7 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-     struct vcsm_user_clean_invalid_s iocache = {};
-     iocache.s[0].handle = p->vcsm_handle;
-     iocache.s[0].cmd = 3; // clean+invalidate
--    iocache.s[0].addr = p->arm;
-+    iocache.s[0].addr = (int) p->arm;
-     iocache.s[0].size  = p->numbytes;
-     vcsm_clean_invalid( &iocache );
- #else
-@@ -390,6 +390,7 @@ static void *vpu_start(void *arg) {
- #ifdef RPI_TIME_TOTAL_POSTED
-   int last_time=0;
-   long long on_time=0;
-+  long long on_time_deblock=0;
-   long long off_time=0;
-   int start_time;
-   int end_time;
-@@ -451,10 +452,13 @@ static void *vpu_start(void *arg) {
- #ifdef RPI_TIME_TOTAL_POSTED
-     end_time = Microseconds();
-     last_time = end_time;
--    on_time += end_time - start_time;
-+    if (p[6]==2)
-+      on_time_deblock += end_time - start_time;
-+    else
-+      on_time += end_time - start_time;
-     count++;
-     if ((count&0x7f)==0)
--      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-+      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
- #endif
-     pthread_mutex_lock(&post_mutex);
-     vpu_async_head++;
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-index e86eb30..c5d8b29 100644
---- a/libavcodec/rpi_shader.c
-+++ b/libavcodec/rpi_shader.c
-@@ -61,7 +61,7 @@ unsigned int rpi_shader[] = {
- /* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
- /* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
- /* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
--/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif,r2
- /* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
- /* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
- /* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
--- 
-2.7.4
-
-
-From e9c59f0d7b42dfb10d85ab2477f95b44484a8d70 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 1 Jul 2015 09:21:17 +0100
-Subject: [PATCH 65/68] Added ability to combine jobs
-
----
- libavcodec/rpi_qpu.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 80 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 05b2169..91777be 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -8,6 +8,8 @@
- #define RPI_TIME_TOTAL_POSTED
- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
- #define RPI_ASYNC
-+// Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
-+#define RPI_COMBINE_JOBS
- 
- #include <stdio.h>
- #include <stdlib.h>
-@@ -398,9 +400,15 @@ static void *vpu_start(void *arg) {
- #endif
-   while(1) {
-     int i;
--    int *p;
-+    int *p; // Pointer for a QPU/VPU job
-+#ifdef RPI_COMBINE_JOBS
-+    int *q = NULL; // Pointer for a VPU only job
-+    int have_qpu = 0;
-+    int have_vpu = 0;
-+#endif
-     int qpu_code;
-     int qpu_codeb;
-+    int num_jobs; // Number of jobs available
-     pthread_mutex_lock(&post_mutex);
-     while( vpu_async_tail - vpu_async_head <= 0)
-     {
-@@ -408,13 +416,38 @@ static void *vpu_start(void *arg) {
-       pthread_cond_wait(&post_cond_tail, &post_mutex);
-     }
-     p = vpu_cmds[vpu_async_head%MAXCMDS];
-+    num_jobs = vpu_async_tail - vpu_async_head;
-     pthread_mutex_unlock(&post_mutex);
- 
-     if (p[6] == -1) {
-       break; // Last job
-     }
-+    if (p[7] == 0 && p[0] == 0 && p[16]==0)
-+      goto job_done_early;
-+
-+#ifdef RPI_COMBINE_JOBS
-+    // First scan for a qpu job
-+    for (int x=0;x<num_jobs;x++) {
-+      p = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-+      if (p[7]) {
-+        have_qpu = 1;
-+        break;
-+      }
-+    }
-+    // Now scan for a non-qpu job
-+    for (int x=0;x<num_jobs;x++) {
-+      q = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-+      if (!q[7]) {
-+        have_vpu = 1;
-+        break;
-+      }
-+    }
-+    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-+#endif
-     qpu_code = p[7];
-     qpu_codeb = p[16];
-+
-+
-     //if (p[7]) {
-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-         //gpu_cache_flush(buf);
-@@ -427,6 +460,40 @@ static void *vpu_start(void *arg) {
-     off_time += start_time-last_time;
- #endif
- 
-+#ifdef RPI_COMBINE_JOBS
-+    if (have_qpu) {
-+      for(i=0;i<8;i++) {
-+        gpu->mail[i*2] = p[8+i];
-+        gpu->mail[i*2 + 1] = qpu_code;
-+      }
-+      for(i=0;i<12;i++) {
-+        gpu->mail2[i*2] = p[17+i];
-+        gpu->mail2[i*2 + 1] = qpu_codeb;
-+      }
-+      if (have_vpu) {
-+        execute_multi(gpu->mb,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                              q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-+        q[0] = 0;
-+      } else {
-+        execute_multi(gpu->mb,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+      }
-+      p[0] = 0;
-+      p[7] = 0;
-+      p[16] = 0;
-+    } else {
-+        av_assert0(have_vpu);
-+        vpu_execute_code(q[0], q[1], q[2], q[3], q[4], q[5], q[6]);
-+        q[0] = 0;
-+    }
-+#else
-+
-     if (!qpu_code) {
-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-     } else {
-@@ -449,17 +516,29 @@ static void *vpu_start(void *arg) {
-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
- #endif
-     }
-+#endif
-+
- #ifdef RPI_TIME_TOTAL_POSTED
-     end_time = Microseconds();
-     last_time = end_time;
-+#ifdef RPI_COMBINE_JOBS
-+    // There are three cases we may wish to distinguish of VPU/QPU activity
-+    on_time += end_time - start_time;
-+#else
-     if (p[6]==2)
-       on_time_deblock += end_time - start_time;
-     else
-       on_time += end_time - start_time;
-+#endif
-     count++;
-     if ((count&0x7f)==0)
-+#ifdef RPI_COMBINE_JOBS
-       printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-+#else
-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-+#endif
- #endif
-+job_done_early:
-     pthread_mutex_lock(&post_mutex);
-     vpu_async_head++;
-     pthread_cond_broadcast(&post_cond_head);
--- 
-2.7.4
-
-
-From 0d54661f303b2a8903e806648ed54a34dcf315dc Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 1 Jul 2015 12:53:10 +0100
-Subject: [PATCH 66/68] Added chroma deblocking
-
----
- libavcodec/hevc.c               |  20 ++
- libavcodec/hevc.h               |  12 +-
- libavcodec/hevc_filter.c        |  92 +++++-
- libavcodec/rpi_hevc_transform.h | 644 +++++++++++++++++++++++++++++++++++++++-
- libavcodec/rpi_hevc_transform.s | 207 +++++++++++++
- libavcodec/rpi_qpu.c            |  27 +-
- libavcodec/rpi_shader.qasm      |  11 +
- 7 files changed, 988 insertions(+), 25 deletions(-)
-
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 4ce94a7..8437e10 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -251,6 +251,14 @@ static void pic_arrays_free(HEVCContext *s)
-       gpu_free(&s->y_setup_ptr);
-       s->y_setup_arm = 0;
-     }
-+    if (s->uv_setup_arm) {
-+      gpu_free(&s->uv_setup_ptr);
-+      s->uv_setup_arm = 0;
-+    }
-+    if (s->vpu_cmds_arm) {
-+      gpu_free(&s->vpu_cmds_ptr);
-+      s->vpu_cmds_arm = 0;
-+    }
- #endif
-     av_freep(&s->sao);
-     av_freep(&s->deblock);
-@@ -324,6 +332,18 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-     s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-     memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-     printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-+
-+    s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
-+    s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
-+    gpu_malloc_uncached(sizeof(*s->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height, &s->uv_setup_ptr); // TODO make this cached
-+    s->uv_setup_arm = (void*)s->uv_setup_ptr.arm;
-+    s->uv_setup_vc = (void*)s->uv_setup_ptr.vc;
-+    memset(s->uv_setup_arm, 0, s->uv_setup_ptr.numbytes);
-+    printf("Setup uv %d by %d by %d\n",s->uv_setup_width,s->uv_setup_height,sizeof(*s->uv_setup_arm));
-+
-+    gpu_malloc_uncached(sizeof(*s->vpu_cmds_arm) * 3,&s->vpu_cmds_ptr);
-+    s->vpu_cmds_arm = (void*) s->vpu_cmds_ptr.arm;
-+    s->vpu_cmds_vc = s->vpu_cmds_ptr.vc;
- #endif
- 
-     s->bs_width  = (width  >> 2) + 1;
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index cf08489..7eb37e6 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -56,7 +56,7 @@
-   #define RPI_MAX_JOBS 2
-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-   #define RPI_WORKER
--
-+  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
-   #define RPI_DEBLOCK_VPU
- 
- #endif
-@@ -980,6 +980,16 @@ typedef struct HEVCContext {
-     uint8_t (*y_setup_vc)[2][2][2][4];
-     int setup_width; // Number of 16x16 blocks across the image
-     int setup_height; // Number of 16x16 blocks down the image
-+
-+    GPU_MEM_PTR_T uv_setup_ptr;
-+    uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
-+    uint8_t (*uv_setup_vc)[2][2][2][4];
-+    int uv_setup_width;
-+    int uv_setup_height;
-+
-+    GPU_MEM_PTR_T vpu_cmds_ptr;
-+    int (*vpu_cmds_arm)[6]; // r0-r5 for each command
-+    int vpu_cmds_vc;
- #endif
- 
- #endif
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 06371da..6367068 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -656,9 +656,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-                                                                    s->frame->linesize[chroma],
-                                                                    c_tc, no_p, no_q);
-                         } else
-+#ifdef RPI_DEBLOCK_VPU
-+                        if (s->enable_rpi_deblock) {
-+                            uint8_t (*setup)[2][2][4];
-+                            int xc = x>>s->ps.sps->hshift[chroma];
-+                            int yc = y>>s->ps.sps->vshift[chroma];
-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-+                            int a = ((yc>>3) & 1) << 1;
-+                            int b = (xc>>3) & 1;
-+                            setup = s->uv_setup_arm[num16];
-+                            setup[0][b][0][a] = c_tc[0];
-+                            setup[0][b][0][a + 1] = c_tc[1];
-+                        } else
-+#endif
-                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
-                                                                  s->frame->linesize[chroma],
-                                                                  c_tc, no_p, no_q);
-+
-                     }
-                 }
- 
-@@ -689,6 +703,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-                                                                    s->frame->linesize[chroma],
-                                                                    c_tc, no_p, no_q);
-                         } else
-+#ifdef RPI_DEBLOCK_VPU
-+                        if (s->enable_rpi_deblock) {
-+                            uint8_t (*setup)[2][2][4];
-+                            int xc = x>>s->ps.sps->hshift[chroma];
-+                            int yc = y>>s->ps.sps->vshift[chroma];
-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-+                            int a = ((xc>>3) & 1) << 1;
-+                            int b = (yc>>3) & 1;
-+                            setup = s->uv_setup_arm[num16];
-+                            setup[1][b][0][a] = c_tc[0];
-+                            setup[1][b][0][a + 1] = c_tc[1];
-+                        } else
-+#endif
-                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
-                                                                  s->frame->linesize[chroma],
-                                                                  c_tc, no_p, no_q);
-@@ -1013,33 +1040,56 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
- {
-   // Flush image, 4 lines above to bottom of ctb stripe
--  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
-   // TODO flush buffer of beta/tc setup when it becomes cached
-+
-+  // Prepare three commands at once to avoid calling overhead
-+  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
-+  s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
-+  s->vpu_cmds_arm[0][2] = s->setup_width;
-+  s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
-+  s->vpu_cmds_arm[0][4] = ctb_size>>4;
-+  s->vpu_cmds_arm[0][5] = 2;
-+
-+  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
-+  s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
-+  s->vpu_cmds_arm[1][2] = s->uv_setup_width;
-+  s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-+  s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-+  s->vpu_cmds_arm[1][5] = 3;
-+
-+  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
-+  s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
-+  s->vpu_cmds_arm[2][2] = s->uv_setup_width;
-+  s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-+  s->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-+  s->vpu_cmds_arm[2][5] = 4;
-+
-   // Call VPU
--  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
--  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
--                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
--                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
-+  vpu_wait(vpu_post_code( vpu_get_fn(), s->vpu_cmds_vc, 3, 0, 0, 0, 5, 0)); // 5 means to do all the commands
- }
- 
--static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
--{
--   int y2;
--   for(y2=y;y2<y+ctb_size;y2+=16) {
--      rpi_deblock(s,y2,16);
--   }
--}
- #endif
- 
- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
- {
-     int x_end = x >= s->ps.sps->width  - ctb_size;
-+#ifdef RPI_DEBLOCK_VPU
-+    int done_deblock = 0;
-+#endif
-     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
-         deblocking_filter_CTB(s, x, y);
- #ifdef RPI_DEBLOCK_VPU
-     if (s->enable_rpi_deblock && x_end)
-     {
--      rpi_deblock(s, y, ctb_size);
-+      int y_at_end = y >= s->ps.sps->height - ctb_size;
-+      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
-+      int y_start = y&~63;
-+      if (y_at_end) height = s->ps.sps->height - y_start;
-+      if ((((y+ctb_size)&63)==0) || y_at_end) {
-+        done_deblock = 1;
-+        rpi_deblock(s, y_start, height);
-+      }
-     }
- #endif
-     if (s->ps.sps->sao_enabled) {
-@@ -1070,11 +1120,25 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-         //int newh = y + ctb_size - 4;
-         //int currh = s->ref->tf.progress->data[0];
-         //if (((y + ctb_size)&63)==0)
-+#ifdef RPI_DEBLOCK_VPU
-+        if (s->enable_rpi_deblock) {
-+          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+          if (done_deblock) {
-+            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+          }
-+        } else {
-+#ifdef RPI_INTER_QPU
-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+#endif
-+          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+        }
-+#else
- #ifdef RPI_INTER_QPU
-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
--        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
- #endif
-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+#endif
-     }
- }
- 
-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-index b3f155f..4309f1c 100644
---- a/libavcodec/rpi_hevc_transform.h
-+++ b/libavcodec/rpi_hevc_transform.h
-@@ -3,14 +3,32 @@ unsigned char rpi_hevc_transform [] = {
- 106,
- 0,
- 144,
--38,
-+47,
- 1,
- 37,
- 106,
- 0,
- 144,
--57,
-+66,
- 1,
-+53,
-+106,
-+0,
-+144,
-+192,
-+4,
-+69,
-+106,
-+0,
-+144,
-+192,
-+4,
-+85,
-+106,
-+0,
-+144,
-+220,
-+5,
- 169,
- 3,
- 62,
-@@ -2427,4 +2445,626 @@ unsigned char rpi_hevc_transform [] = {
- 128,
- 90,
- 0,
 +169,
 +3,
 +14,
@@ -35609,15 +10256,100 @@ index b3f155f..4309f1c 100644
 +30,
 +33,
 +3,
- };
++};
 diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-index b055208..5543093 100644
---- a/libavcodec/rpi_hevc_transform.s
+new file mode 100644
+index 0000000..5543093
+--- /dev/null
 +++ b/libavcodec/rpi_hevc_transform.s
-@@ -85,6 +85,13 @@ hevc_trans_16x16:
-   beq memclear16
-   cmp r5,2
-   beq hevc_deblock_16x16
+@@ -0,0 +1,917 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++#   output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++#   (a b c d) (1 2  2  1)
++#             (3 4 -4 -3)
++#             (5 6  6  5)
++#             (7 8 -8 -7)
++#
++#  x=(a c)(1 2) = 1a+5c 2a+6c
++#         (5 6)
++#
++#  y=(b d)(3 4) = 3b+7d 4b+8d
++#         (7 8)
++#
++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++#  Final results are (u , v[::-1])
++#
++#
++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++#  Apply the even matrix first and stop before rounding
++#  Then apply the odd matrix in a full manner:
++#
++#   First step is to compute partial products with the first input (16 cycles)
++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
++#   2a 4b 6c 8d
++#   2a -4b 6c -8d
++#   1a -3b 5c -7d
++#
++#   Second step is to sum partial products into final position (8 cycles)
++#   1a+3b+5c+7d
++#   2a+4b+6c+8d
++#   2a-4b+6c-8d
++#   1a-3b+5c-7d
++#
++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++#   For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++hevc_trans_16x16:
++  cmp r5,1
++  beq memclear16
++  cmp r5,2
++  beq hevc_deblock_16x16
 +  cmp r5,3
 +  beq hevc_uv_deblock_16x16
 +  cmp r5,4
@@ -35625,13 +10357,629 @@ index b055208..5543093 100644
 +  cmp r5,5
 +  beq hevc_run_command_list
 +
-   push r6-r15, lr # TODO cut down number of used registers
-   mov r14,r3 # coeffs32
-   mov r15,r4 # num32
-@@ -708,3 +715,203 @@ normal_filtering:
- 
- filtering_done:
-   b lr
++  push r6-r15, lr # TODO cut down number of used registers
++  mov r14,r3 # coeffs32
++  mov r15,r4 # num32
++  mov r3, 16*2 # Stride of transMatrix2 in bytes
++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++  # Now use r0 to describe which matrix we are working on.
++  # Allows us to prefetch the next block of coefficients for efficiency.
++  mov r0,0 # This describes the location where we read our coefficients from
++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++  mov r7,16*16*2 # Total block size
++  mov r8,64*16 # Value used to swap from current to next VRF location
++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
++  mov r4,64 # Constant used for rounding first pass
++  mov r5,1<<11 # Constant used for rounding second pass
++
++  # At start of block r0,r1 point to the current block (that has already been loaded)
++block_loop:
++  eor r0,r8
++  add r1,r7
++  # Prefetch the next block
++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
++  eor r0,r8
++  sub r1,r7
++
++  # Transform the current block
++  bl col_trans_16
++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
++
++  bl col_trans_16
++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 12, and saturate
++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
++
++  # Save results - note there has been a transposition during the processing so we save columns
++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++  # Move onto next block
++  eor r0,r8
++  add r1,r7
++
++  addcmpbgt r2,-1,0,block_loop
++
++  # Now go and do any 32x32 transforms
++  b hevc_trans_32x32
++
++  pop r6-r15, pc
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++  add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++  # First compute partial products for a single column
++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++  # Then sum up the results and place back
++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++  addcmpblt r0,1,r6,col_trans_16_loop
++  sub r0,16  # put r0 back to its original value
++  b lr
++
++col_trans_odd_16:
++  add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++  # First compute partial products for a single column
++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++  # Then sum up the results and place back
++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++  addcmpblt r0,1,r6,col_trans_odd_16_loop
++  sub r0,16  # put r0 back to its original value
++  b lr
++
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++#
++hevc_trans_32x32:
++  mov r1,r14 # coeffs
++  mov r2,r15 # num
++
++  # Fetch odd transform matrix
++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++  #add r0, 16*16*2
++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++  mov r7, 16*16*2 # Total block size
++  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
++  # set r8 to 32byte aligned stack pointer
++  add r8,sp,31
++  lsr r8,5
++  lsl r8,5
++  mov r9,r8  # Backup of the temporary storage
++  mov r10,r1 # Backup of the coefficient buffer
++block_loop32:
++
++  # COLUMN TRANSFORM
++  mov r4, 64 # Constant used for rounding first pass
++  mov r5, 9 # left shift used for rounding first pass
++
++  # Transform the first 16 columns
++  mov r1,r10  # Input Coefficient buffer
++  mov r8,r9   # Output temporary storage
++  bl trans32
++  # Transform the second 16 columns
++  add r8,32*16*2
++  add r1,32
++  bl trans32
++
++  # ROW TRANSFORM
++  mov r4, 1<<11 # Constant used for rounding second pass
++  mov r5, 4 # left shift used for rounding second pass
++
++  mov r1,r9  # Input temporary storage
++  mov r8,r10   # Output Coefficient buffer
++  bl trans32
++  # Transform the second 16 columns
++  add r8,32*16*2
++  add r1,32
++  bl trans32
++
++  add r10, 32*32*2 # move onto next block of coefficients
++  addcmpbgt r2,-1,0,block_loop32
++
++  add sp,sp,32*32*2+32 # Restore stack
++
++  pop r6-r15, pc
++
++trans32:
++  push lr
++  # We can no longer afford the VRF space to do prefetching when doing 32x32
++  # Fetch the even rows
++  vldh HX(0++,0),(r1 += r3) REP 16
++  # Fetch the odd rows
++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++  # Transform the even rows using even matrix
++  mov r0, 0 # Even rows
++  bl col_trans_16
++
++  # Now transform the odd rows using odd matrix
++  mov r0, 64*16 # Odd rows
++  bl col_trans_odd_16
++
++  # Now apply butterfly to compute the first 16 results
++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
++  # 16bit results now in HX(48,32)
++  mov r0,r8
++  mov r6,32*2
++  vsth VX(48,32++),(r0+=r6) REP 16
++
++  # Now apply butterfly to compute the second 16 results (in reverse order)
++  vsub HY(63,0),HY(0 ,0),HY(16,0)
++  vsub HY(62,0),HY(1 ,0),HY(17,0)
++  vsub HY(61,0),HY(2 ,0),HY(18,0)
++  vsub HY(60,0),HY(3 ,0),HY(19,0)
++  vsub HY(59,0),HY(4 ,0),HY(20,0)
++  vsub HY(58,0),HY(5 ,0),HY(21,0)
++  vsub HY(57,0),HY(6 ,0),HY(22,0)
++  vsub HY(56,0),HY(7 ,0),HY(23,0)
++  vsub HY(55,0),HY(8 ,0),HY(24,0)
++  vsub HY(54,0),HY(9 ,0),HY(25,0)
++  vsub HY(53,0),HY(10,0),HY(26,0)
++  vsub HY(52,0),HY(11,0),HY(27,0)
++  vsub HY(51,0),HY(12,0),HY(28,0)
++  vsub HY(50,0),HY(13,0),HY(29,0)
++  vsub HY(49,0),HY(14,0),HY(30,0)
++  vsub HY(48,0),HY(15,0),HY(31,0)
++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
++  add r0,r8,32
++  vsth VX(48,32++),(r0+=r6) REP 16
++  pop pc
++
++memclear16:
++  # r0 is address
++  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
++  vmov HX(0++,0),0 REP 16
++  mov r2,32
++loop:
++  vsth HX(0++,0),(r0+=r2) REP 16
++  add r0,16*16*2
++  sub r1,16*16
++  cmp r1,0
++  bgt loop
++  b lr
++
++
++################################################################################
++# HEVC VPU Deblock
++#
++# Vertical edges before horizontal
++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
++#
++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
++# The VPU code works in units of 16x16 blocks.
++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
++# One final horizontal filter is required at the end.
++# PCM is not allowed in this code.
++#
++#
++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering).
++
++.set P0,63
++.set P1,62
++.set P2,61
++.set P3,60
++.set Q0,59
++.set Q1,58
++.set Q2,57
++.set Q3,56
++
++.set dp,32
++.set dq,33
++.set d,34
++.set decision,35
++.set beta,36
++.set beta2,37
++.set beta3,38
++.set ptest,39
++.set qtest,40
++.set pqtest,41
++.set thresh,42
++.set deltatest, 44
++.set deltap1, 45
++.set tc25, 46
++.set setup,47
++.set tc,48
++.set tc25,49
++.set tc2, 50
++.set do_filter, 51
++.set delta, 52
++.set tc10, 53
++.set delta0, 54
++.set delta1, 55
++.set zeros, 0
++.set setup_input, 1
++.set deltaq1, 2
++
++
++
++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
++# Row has num16 16x16 blocks across
++# Beta goes from 0 to 64
++# tc goes from 0 to 24
++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
++#   has 8 bytes per edge
++#   has 16 bytes per direction
++#   has 32 bytes per 16x16 block
++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
++hevc_deblock_16x16:
++  push r6-r15, lr
++  mov r9,r4
++  mov r4,r3
++  mov r13,r2
++  mov r2,r0
++  mov r10,r0
++  subscale4 r0,r1
++  mov r8,63
++  mov r6,-3
++  vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is location of current block - 4 * stride
++# r1 is stride
++# r2 is location of current block
++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
++# r4 is setup
++# r5 is for temporary calculations
++# r8 holds 63
++# r6 holds -3
++# r9 holds the number of 16 high rows to process
++# r10 holds the original img base
++# r11 returns 0 if no filtering was done on the edge
++# r12 saves a copy of this
++# r13 is copy of width
++
++process_row:
++  # First iteration does not do horizontal filtering on previous
++  mov r7, r13
++  mov r3,0
++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
++  vldb H(16++,16)+r3,(r2 += r1) REP 16
++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
++  vstb H(zeros,0),(r4)
++  bl vert_filter
++  add r3,8
++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
++  bl vert_filter
++  sub r3,8
++  b start_deblock_loop
++deblock_loop:
++  # Middle iterations do vertical on current block and horizontal on preceding
++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
++  vldb H(16++,16)+r3,(r2 += r1) REP 16
++  vldb H(setup_input,0), (r4)
++  vstb H(zeros,0),(r4)
++  bl vert_filter
++  add r3,8
++  vadd H(setup_input,0),H(setup_input,8),0
++  bl vert_filter
++  sub r3,8
++  vldb H(setup_input,0), -16(r4)
++  vstb H(zeros,0),-16(r4)
++  bl horz_filter
++  mov r12,r11
++  add r3,8*64
++  vadd H(setup_input,0),H(setup_input,8),0
++  bl horz_filter
++  sub r3,8*64
++  addcmpbeq r12,0,0,skip_save_top
++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
++skip_save_top:
++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++start_deblock_loop:
++  # move onto next 16x16 (could do this with circular buffer support instead)
++  add r3,16
++  and r3,r8
++  add r4,32
++  # Perform loop counter operations (may work with an addcmpbgt as well?)
++  add r0,16
++  add r2,16
++  sub r7,1
++  cmp r7,0 # Are there still more blocks to load
++  bgt deblock_loop
++
++  # Final iteration needs to just do horizontal filtering
++  vldb H(setup_input,0), -16(r4)
++  vstb H(zeros,0),-16(r4)
++  bl horz_filter
++  mov r12,r11
++  add r3,8*64
++  vadd H(setup_input,0),H(setup_input,8),0
++  bl horz_filter
++  sub r3,64*8
++  addcmpbeq r12,0,0,skip_save_top2
++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
++skip_save_top2:
++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++
++# Now look to see if we should do another row
++  sub r9,1
++  cmp r9,0
++  bgt start_again
++  pop r6-r15, pc
++start_again:
++  # Need to sort out r0,r2 to point to next row down
++  addscale16 r10,r1
++  mov r2,r10
++  subscale4 r0,r2,r1
++  b process_row
++
++
++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
++# So that the filter code can be reused, we move the pixels to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0); a final saturation step is performed when placing them back into the correct locations
++
++vert_filter:
++  push lr
++
++  vmov HX(P3,0), V(16,12)+r3
++  vmov HX(P2,0), V(16,13)+r3
++  vmov HX(P1,0), V(16,14)+r3
++  vmov HX(P0,0), V(16,15)+r3
++  vmov HX(Q0,0), V(16,16)+r3
++  vmov HX(Q1,0), V(16,17)+r3
++  vmov HX(Q2,0), V(16,18)+r3
++  vmov HX(Q3,0), V(16,19)+r3
++
++  bl do_luma_filter
++
++  vadds V(16,13)+r3, HX(P2,0), 0
++  vadds V(16,14)+r3, HX(P1,0), 0
++  vadds V(16,15)+r3, HX(P0,0), 0
++  # P3 and Q3 never change so don't bother saving back
++  vadds V(16,16)+r3, HX(Q0,0), 0
++  vadds V(16,17)+r3, HX(Q1,0), 0
++  vadds V(16,18)+r3, HX(Q2,0), 0
++
++  pop pc
++
++# Filter edge at H(16,0)+r3
++horz_filter:
++  push lr
++
++  vmov HX(P3,0), H(12,0)+r3
++  vmov HX(P2,0), H(13,0)+r3
++  vmov HX(P1,0), H(14,0)+r3
++  vmov HX(P0,0), H(15,0)+r3
++  vmov HX(Q0,0), H(16,0)+r3
++  vmov HX(Q1,0), H(17,0)+r3
++  vmov HX(Q2,0), H(18,0)+r3
++  vmov HX(Q3,0), H(19,0)+r3
++
++  bl do_luma_filter
++
++  vadds H(13,0)+r3, HX(P2,0), 0
++  vadds H(14,0)+r3, HX(P1,0), 0
++  vadds H(15,0)+r3, HX(P0,0), 0
++  # P3 and Q3 never change so don't bother saving back
++  vadds H(16,0)+r3, HX(Q0,0), 0
++  vadds H(17,0)+r3, HX(Q1,0), 0
++  vadds H(18,0)+r3, HX(Q2,0), 0
++
++  pop pc
++
++# r4 points to array of beta/tc for each 4 length edge (NOTE(review): r4 is not referenced in this routine; beta/tc are read from H(setup_input,0), presumably loaded by the caller)
++do_luma_filter:
++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
++  valtl HX(beta,0),H(setup,0),H(setup,0)
++  valtu HX(tc,0),H(setup,0),H(setup,0)
++  vmul HX(tc25,0), HX(tc,0), 5
++  vadd HX(tc25,0),HX(tc25,0), 1
++  vasr HX(tc25,0), HX(tc25,0), 1 # tc25 = (5*tc + 1) >> 1
++
++  # Compute decision
++  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
++  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
++  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
++  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
++
++  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
++  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
++  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
++  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
++
++  vadd HX(d,0), HX(dp,0), HX(dq,0) # d = dp + dq
++  vasr HX(beta2,0),HX(beta,0),2 # beta2 = beta >> 2
++  vasr HX(beta3,0),HX(beta,0),3 # beta3 = beta >> 3
++
++  # Compute flags that are negative if all conditions pass
++  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
++  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
++  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
++
++  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
++  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
++  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
++  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
++  vmov HX(decision,0), 1 IFNN
++  vadd H(decision,0),H(decision,3),0 IFN
++  vadd H(decision,16),H(decision,19),0 IFN
++  vmov -,HX(decision,0) SETF   # N marks strong filter
++  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
++
++  vadd HX(do_filter,0), HX(d,3), HX(d,0)
++  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
++  vmov HX(decision,0),0 IFNN # Z marks no filter
++
++  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
++  # First extract out even terms
++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
++  # Now expand back
++  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
++  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
++
++  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
++
++  # Do a quick check to see if there is anything to do
++  mov r11, 0 # Signal no filtering
++  vmov -,1 IFNZ SUMS r5
++  cmp r5,0
++  beq filtering_done
++  mov r11, 1 # Signal some filtering
++  # And whether there is any strong filtering
++  vmov -,1 IFN SUMS r5
++  cmp r5,0
++  beq normal_filtering
++
++  ##############################################################################
++  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
++  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tc2 is tc/2, while here it is tc*2
++
++  # Take a copy of the original pixels for use in decision calculation
++  vmov HX(P0,32),HX(P0,0)
++  vmov HX(Q0,32),HX(Q0,0)
++  vmov HX(P1,32),HX(P1,0)
++  vmov HX(Q1,32),HX(Q1,0)
++  vmov HX(P2,32),HX(P2,0)
++  vmov HX(Q2,32),HX(Q2,0)
++
++  vadd -,HX(P2,32),4 CLRA SACC
++  vshl -,HX(P1,32),1 SACC
++  vshl -,HX(P0,32),1 SACC
++  vshl -,HX(Q0,32),1 SACC
++  vshl HX(delta,0),HX(Q1,32),0 SACC
++  vasr HX(delta,0),HX(delta,0), 3 # presumably (P2 + 2*P1 + 2*P0 + 2*Q0 + Q1 + 4) >> 3, assuming SACC sums into the accumulator
++  vsub HX(delta,0),HX(delta,0),HX(P0,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN # written only where the flags are N (strong filter)
++
++  vadd -,HX(P2,32),2 CLRA SACC
++  vadd -,HX(P1,32),HX(P0,32) SACC
++  vshl HX(delta,0),HX(Q0,32),0 SACC
++  vasr HX(delta,0),HX(delta,0), 2 # presumably (P2 + P1 + P0 + Q0 + 2) >> 2
++  vsub HX(delta,0),HX(delta,0),HX(P1,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
++
++  vadd -,HX(Q0,32),4 CLRA SACC
++  vadd -,HX(P1,32),HX(P0,32) SACC
++  vmul -,HX(P2,32),3 SACC
++  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
++  vasr HX(delta,0),HX(delta,0), 3 # presumably (2*P3 + 3*P2 + P1 + P0 + Q0 + 4) >> 3
++  vsub HX(delta,0),HX(delta,0),HX(P2,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
++  #vmov HX(P2,0),3 IFN
++
++  # Now reverse all P/Qs
++
++  vadd -,HX(Q2,32),4 CLRA SACC
++  vshl -,HX(Q1,32),1 SACC
++  vshl -,HX(Q0,32),1 SACC
++  vshl -,HX(P0,32),1 SACC
++  vshl HX(delta,0),HX(P1,32),0 SACC
++  vasr HX(delta,0),HX(delta,0), 3
++  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
++
++  vadd -,HX(Q2,32),2 CLRA SACC
++  vadd -,HX(Q1,32),HX(Q0,32) SACC
++  vshl HX(delta,0),HX(P0,32),0 SACC
++  vasr HX(delta,0),HX(delta,0), 2
++  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
++
++  vadd -,HX(P0,32),4 CLRA SACC
++  vadd -,HX(Q1,32),HX(Q0,32) SACC
++  vmul -,HX(Q2,32),3 SACC
++  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
++  vasr HX(delta,0),HX(delta,0), 3
++  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
++
++  ##############################################################################
++  # Normal filtering (the strong path above falls through into this code)
++normal_filtering:
++  # Invert the decision flags
++  # make instruction more complicated as assembler has error and loses SETF
++  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
++  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
++
++  vmov -,1 IFN SUMS r5
++  cmp r5,0
++  beq filtering_done
++
++  vasr HX(tc2,0), HX(tc,0), 1 # tc2 = tc >> 1 on this path
++  vmul HX(tc10,0), HX(tc,0), 10 # tc10 was only scratch for the flag inversion above; now tc*10
++
++  vasr HX(thresh,0), HX(beta,0), 1
++  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
++  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC # thresh = (beta + (beta >> 1)) >> 3; NOTE(review): CLRA SACC looks unneeded here, the accumulator is cleared again before its next SACC use - confirm
++
++  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
++  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
++  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
++  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
++  # Expand ptest and qtest together
++  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
++  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
++  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
++  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
++  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
++
++  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
++  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
++  vmov -,8 CLRA SACC
++  vmul -,HX(delta0,0), 9 SACC
++  vmul HX(delta0,0),HX(delta1,0), r6 SACC # NOTE(review): r6 is not set in this routine; presumably the caller preloads the multiplier (likely -3) - confirm
++  vasr HX(delta0,0), HX(delta0,0), 4 # presumably (9*(Q0-P0) + r6*(Q1-P1) + 8) >> 4
++  vdist HX(deltatest,0), HX(delta0,0), 0
++  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
++  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
++
++  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
++
++  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
++  vadd HX(deltap1,0), HX(deltap1,0), 1
++  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
++  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
++  vasr HX(deltap1,0), HX(deltap1,0), 1 # presumably (((P2 + P0 + 1) >> 1) - P1 + delta0) >> 1
++  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
++
++  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
++  vadd HX(deltaq1,0), HX(deltaq1,0), 1
++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
++  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) # NOTE(review): no SACC, and deltaq1 is overwritten two lines below, so this write looks dead - confirm
++  vrsub -, HX(delta0,0), 0 SACC
++  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 # presumably (((Q2 + Q0 + 1) >> 1) - Q1 - delta0) >> 1
++  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
++
++  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
++  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
++
++  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
++  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
++
++  vmov -,HX(deltatest,0) SETF
++  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
++  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
++
++  #vmov HX(P2,0),1 IFN
++
++filtering_done:
++  b lr # return; r11 is 0 if nothing was filtered, 1 otherwise
 +
 +
 +hevc_uv_deblock_16x16:
@@ -35832,1623 +11180,1410 @@ index b055208..5543093 100644
 +  bgt loop_cmds
 +
 +  pop r6-r7, pc
+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+new file mode 100644
+index 0000000..3904efc
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,340 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/mman.h>
++#include <sys/ioctl.h>
++
++#include <linux/ioctl.h>
++
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
++
++#include "rpi_mailbox.h"
++
++#define PAGE_SIZE (4*1024)
++
++// Shared memory will not be cached in ARM cache. Maps `size` bytes of /dev/mem at `base`; returns NULL on failure.
++void *mapmem_shared(unsigned base, unsigned size)
++{
++   int mem_fd;
++   unsigned offset = base % PAGE_SIZE; // distance of base above the previous 4K boundary
++   base = base - offset; // round base down to that boundary for use as the mmap file offset
++   /* open /dev/mem */
++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
++      return NULL;
++   }
++   void *mem = mmap(
++      0,
++      size, // NOTE(review): length excludes `offset`, so an unaligned base may leave the tail unmapped - confirm callers pass aligned bases
++      PROT_READ|PROT_WRITE,
++      MAP_SHARED/*|MAP_FIXED*/,
++      mem_fd,
++      base);
++#ifdef DEBUG
++   printf("base=0x%x, mem=%p\n", base, mem);
++#endif
++   close(mem_fd); // a mapping stays valid after its descriptor is closed; closing before the check means the error path no longer leaks it
++   if (mem == MAP_FAILED) {
++      printf("mmap error %d\n", (int)mem);
++      return NULL;
++   }
++   return (char *)mem + offset; // pointer to the originally requested (possibly unaligned) address
++}
++
++// Unshared memory will be faster as lives in ARM cache, but requires cache flushing. Returns NULL on failure.
++void *mapmem_private(unsigned base, unsigned size)
++{
++   int mem_fd;
++   unsigned offset = base % PAGE_SIZE; // distance of base above the previous 4K boundary
++   base = base - offset; // round base down to that boundary for use as the mmap file offset
++   /* open /dev/mem */
++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
++      return NULL;
++   }
++   void *mem = mmap(
++      0,
++      size, // NOTE(review): length excludes `offset`, so an unaligned base may leave the tail unmapped - confirm callers pass aligned bases
++      PROT_READ|PROT_WRITE,
++      MAP_PRIVATE/*|MAP_FIXED*/,
++      mem_fd,
++      base);
++#ifdef DEBUG
++   printf("base=0x%x, mem=%p\n", base, mem);
++#endif
++   close(mem_fd); // a mapping stays valid after its descriptor is closed; closing before the check means the error path no longer leaks it
++   if (mem == MAP_FAILED) {
++      printf("mmap error %d\n", (int)mem);
++      return NULL;
++   }
++   return (char *)mem + offset; // pointer to the originally requested (possibly unaligned) address
++}
++
++void unmapmem(void *addr, unsigned size) // Unmaps [addr, addr+size); terminates the process if munmap fails
++{
++   int s = munmap(addr, size); // NOTE(review): mapmem_* return mem + offset, which is not page aligned when base was not; munmap may reject such an address - confirm callers
++   if (s != 0) {
++      printf("munmap error %d\n", s);
++      exit (-1); // failure is fatal for the whole process, not reported to the caller
++   }
++}
++
++/*
++ * Send the property message in buf to the mailbox driver via ioctl(IOCTL_MBOX_PROPERTY); returns the ioctl result.
++ */
++
++static int mbox_property(int file_desc, void *buf)
++{
++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
++
++   if (ret_val < 0) {
++      printf("ioctl_set_msg failed:%d\n", ret_val); // failure is only logged here; the negative value is returned below
++   }
++
++#ifdef DEBUG
++   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; // word 0 of the message holds its size in bytes
++   for (i=0; i<size/4; i++) // dump every 32-bit word of the message
++      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]); // NOTE(review): i*sizeof *p has type size_t but is printed with %x
++#endif
++   return ret_val;
++}
++
++unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags) // Sends tag 0x3000c with size/align/flags; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x3000c; // (the tag id)
++   p[i++] = 12; // (size of the buffer)
++   p[i++] = 12; // (size of the data)
++   p[i++] = size; // (num bytes? or pages?)
++   p[i++] = align; // (alignment)
++   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `size`
++   return p[5]; // first value word of the tag
++}
++
++unsigned mem_free(int file_desc, unsigned handle) // Sends tag 0x3000f with `handle`; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x3000f; // (the tag id)
++   p[i++] = 4; // (size of the buffer)
++   p[i++] = 4; // (size of the data)
++   p[i++] = handle;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `handle`
++   return p[5]; // first value word of the tag
++}
++
++unsigned mem_lock(int file_desc, unsigned handle) // Sends tag 0x3000d with `handle`; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x3000d; // (the tag id)
++   p[i++] = 4; // (size of the buffer)
++   p[i++] = 4; // (size of the data)
++   p[i++] = handle;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `handle`
++   return p[5]; // first value word of the tag
++}
++
++unsigned mem_unlock(int file_desc, unsigned handle) // Sends tag 0x3000e with `handle`; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x3000e; // (the tag id)
++   p[i++] = 4; // (size of the buffer)
++   p[i++] = 4; // (size of the data)
++   p[i++] = handle;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `handle`
++   return p[5]; // first value word of the tag
++}
++
++unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5) // Sends tag 0x30010 with code and r0..r5; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x30010; // (the tag id)
++   p[i++] = 28; // (size of the buffer)
++   p[i++] = 28; // (size of the data)
++   p[i++] = code; // 7 value words follow the tag header: code, r0..r5 (7*4 = 28 bytes)
++   p[i++] = r0;
++   p[i++] = r1;
++   p[i++] = r2;
++   p[i++] = r3;
++   p[i++] = r4;
++   p[i++] = r5;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `code`
++   return p[5]; // first value word of the tag
++}
++
++unsigned qpu_enable(int file_desc, unsigned enable) // Sends tag 0x30012 with `enable`; returns the first value word of the reply
++{
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++
++   p[i++] = 0x30012; // (the tag id)
++   p[i++] = 4; // (size of the buffer)
++   p[i++] = 4; // (size of the data)
++   p[i++] = enable;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `enable`
++   return p[5]; // first value word of the tag
++}
++
++unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) { // Sends tag 0x30011; returns the first value word of the reply
++   int i=0;
++   unsigned p[32]; // message buffer; only the first i words are initialised and p[0] records that length in bytes
++
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++   p[i++] = 0x30011; // (the tag id)
++   p[i++] = 16; // (size of the buffer)
++   p[i++] = 16; // (size of the data)
++   p[i++] = num_qpus; // 4 value words follow the tag header (4*4 = 16 bytes)
++   p[i++] = control;
++   p[i++] = noflush;
++   p[i++] = timeout; // ms
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored; on failure p[5] presumably still holds `num_qpus`
++   return p[5]; // first value word of the tag
++}
++
++void execute_multi(int file_desc, // Sends one tag 0x30018 message carrying two QPU parameter sets and two code/register sets; nothing is returned
++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
++   int i=0;
++   unsigned p[32]; // 28 words are used: 5 header + 22 values + end tag
++
++   p[i++] = 0; // size
++   p[i++] = 0x00000000; // process request
++   p[i++] = 0x30018; // (the tag id)
++   p[i++] = 88; // (size of the buffer)
++   p[i++] = 88; // (size of the data)
++
++   p[i++] = num_qpus; // first QPU parameter set
++   p[i++] = control;
++   p[i++] = noflush;
++   p[i++] = timeout; // ms
++
++   p[i++] = num_qpus_2; // second QPU parameter set
++   p[i++] = control_2;
++   p[i++] = noflush_2;
++   p[i++] = timeout_2; // ms
++
++   p[i++] = code; // first code address and its r0..r5 arguments
++   p[i++] = r0;
++   p[i++] = r1;
++   p[i++] = r2;
++   p[i++] = r3;
++   p[i++] = r4;
++   p[i++] = r5;
++
++   p[i++] = code_2; // second code address and its r0..r5 arguments
++   p[i++] = r0_2;
++   p[i++] = r1_2;
++   p[i++] = r2_2;
++   p[i++] = r3_2;
++   p[i++] = r4_2;
++   p[i++] = r5_2;
++
++   p[i++] = 0x00000000; // end tag
++   p[0] = i*sizeof *p; // actual size
++
++   mbox_property(file_desc, p); // NOTE(review): result ignored, so callers cannot detect a failed submission
++   return;
++}
++
++int mbox_open() { // Opens DEVICE_FILE_NAME and returns the descriptor (negative on failure); NOTE(review): the header declares mbox_open(void)
++   int file_desc;
++
++   // open a char device file used for communicating with kernel mbox driver
++   file_desc = open(DEVICE_FILE_NAME, 0); // flags value 0
++   if (file_desc < 0) {
++      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++   }
++   return file_desc; // returned as-is, so callers must check for a negative value
++}
++
++void mbox_close(int file_desc) { // Closes a descriptor obtained from mbox_open
++  close(file_desc); // the result of close() is not checked
++}
+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+new file mode 100644
+index 0000000..5898102
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,25 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
++
++extern int mbox_open(void); // returns a descriptor for /dev/vcio, negative on failure
++extern void mbox_close(int file_desc);
++
++extern unsigned get_version(int file_desc); // NOTE(review): rpi_mailbox.c as added by this patch contains no definition of get_version
++extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
++extern unsigned mem_free(int file_desc, unsigned handle);
++extern unsigned mem_lock(int file_desc, unsigned handle);
++extern unsigned mem_unlock(int file_desc, unsigned handle);
++extern void *mapmem_shared(unsigned base, unsigned size); // MAP_SHARED mapping of /dev/mem; NULL on failure
++extern void *mapmem_private(unsigned base, unsigned size); // MAP_PRIVATE mapping of /dev/mem; NULL on failure
++extern void unmapmem(void *addr, unsigned size); // calls exit(-1) if munmap fails
++
++extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
++extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
++extern void execute_multi(int file_desc, // one mailbox message carrying two QPU parameter sets and two code/register sets
++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
++extern unsigned qpu_enable(int file_desc, unsigned enable);
++
++#endif
 diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 91777be..5aa0432 100644
---- a/libavcodec/rpi_qpu.c
+new file mode 100644
+index 0000000..a01c051
+--- /dev/null
 +++ b/libavcodec/rpi_qpu.c
-@@ -397,6 +397,8 @@ static void *vpu_start(void *arg) {
-   int start_time;
-   int end_time;
-   int count=0;
-+  int count_deblock=0;
-+  int count_qpu=0;
- #endif
-   while(1) {
-     int i;
-@@ -442,7 +444,7 @@ static void *vpu_start(void *arg) {
-         break;
-       }
-     }
--    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-+    //printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
- #endif
-     qpu_code = p[7];
-     qpu_codeb = p[16];
-@@ -460,6 +462,12 @@ static void *vpu_start(void *arg) {
-     off_time += start_time-last_time;
- #endif
- 
+@@ -0,0 +1,991 @@
++#ifdef RPI
++// Use vchiq service for submitting jobs
++#define GPUSERVICE
++
++// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
++// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
++//#define RPI_TIME_TOTAL_QPU
++// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
++//#define RPI_TIME_TOTAL_VPU
++// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
++#define RPI_TIME_TOTAL_POSTED
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++#include "libavutil/avassert.h"
++
++#include "config.h"
++
++#include <pthread.h>
++#include <time.h>
++
++#include "rpi_mailbox.h"
++#include "rpi_qpu.h"
++#include "rpi_shader.h"
++#include "rpi_hevc_transform.h"
++
++#include "rpi_user_vcsm.h"
++#ifdef GPUSERVICE
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"
++#pragma GCC diagnostic pop
++#endif
++
++// QPU profile flags
 +#define NO_FLUSH 1
 +#define CLEAR_PROFILE 2
 +#define OUTPUT_COUNTS 4
 +
 +#define FLAGS_FOR_PROFILING (NO_FLUSH)
 +
- #ifdef RPI_COMBINE_JOBS
-     if (have_qpu) {
-       for(i=0;i<8;i++) {
-@@ -472,14 +480,14 @@ static void *vpu_start(void *arg) {
-       }
-       if (have_vpu) {
-         execute_multi(gpu->mb,
--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-                               q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-         q[0] = 0;
-       } else {
-         execute_multi(gpu->mb,
--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-@@ -510,7 +518,7 @@ static void *vpu_start(void *arg) {
-       execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
- #else
-       execute_multi(gpu->mb,
--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING , 5000,
-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-@@ -525,17 +533,20 @@ static void *vpu_start(void *arg) {
-     // There are three cases we may wish to distinguish of VPU/QPU activity
-     on_time += end_time - start_time;
- #else
--    if (p[6]==2)
-+    if (p[6]>1) {
-+      count_deblock++;
-       on_time_deblock += end_time - start_time;
--    else
-+    } else {
-       on_time += end_time - start_time;
-+      count_qpu++;
-+    }
- #endif
-     count++;
-     if ((count&0x7f)==0)
- #ifdef RPI_COMBINE_JOBS
--      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
--#else
-       printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-+#else
-+      printf("Posted %d On=%dms (%d calls), On_deblock=%dms (%d calls), Off=%dms\n",count,(int)(on_time/1000),count_qpu,(int)(on_time_deblock/1000),count_deblock,(int)(off_time/1000));
- #endif
- #endif
- job_done_early:
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-index 0686249..64bf5b0 100644
---- a/libavcodec/rpi_shader.qasm
-+++ b/libavcodec/rpi_shader.qasm
-@@ -1077,6 +1077,17 @@ nop        ; nop # delay slot 2
- ::mc_interrupt_exit12
-   mov  -, vw_wait # wait on the VDW
- 
-+  # Dummy wait to test instructions
-+#  mov r3,1000000
-+#:dummy_loop
-+#  sub.setf r3, r3, 1
-+#  nop
-+#  nop
-+#  brr.anynn -, r:dummy_loop
-+#  nop
-+#  nop
-+#  nop
 +
-   ldtmu0
-   ldtmu0
-   ldtmu1
--- 
-2.7.4
-
-
-From 12a194bddd049ab97154e9fbdd46b63b558a3bee Mon Sep 17 00:00:00 2001
-From: Ben Avison <bavison@riscosopen.org>
-Date: Tue, 23 Jun 2015 23:42:03 +0100
-Subject: [PATCH 67/68] armv7/hevc: Optimise deblocking boundary strength
- calculation
-
----
- libavcodec/arm/hevcdsp_deblock_neon.S | 115 +++++++++++++++++
- libavcodec/arm/hevcdsp_init_neon.c    |   9 ++
- libavcodec/hevc.h                     |  11 --
- libavcodec/hevc_filter.c              | 224 ++++++++++++++--------------------
- libavcodec/hevcdsp.c                  | 116 ++++++++++++++++++
- libavcodec/hevcdsp.h                  |  14 +++
- 6 files changed, 344 insertions(+), 145 deletions(-)
-
-diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-index 166bddb..bad4589 100644
---- a/libavcodec/arm/hevcdsp_deblock_neon.S
-+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-@@ -383,3 +383,118 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
-         vst1.8   {d4}, [r0]
-         bx       lr
- endfunc
++// On Pi2 there is no way to access the VPU L2 cache
++// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
++// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
++// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
++#define GPU_MEM_FLG 0x4
++// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
++#define GPU_MEM_MAP 0x0
 +
-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
-+ */
-+function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+        add         ip, sp, #4*4
-+        push        {a2-a4,v1-v8,lr}
-+        ldmia       ip, {v5-v7}
-+1:      ldmdb       ip, {v1-v4}
-+        ldrsb       a3, [v5, #8]    @ curr->ref_idx
-+        ldrsb       v8, [v5, #9]
-+        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
-+        ldrsb       lr, [v6, #9]
-+        ldr         v1, [v1, a3, lsl #2]
-+        ldrb        a3, [v5, #10]   @ curr->pred_flag
-+        ldr         v2, [v2, v8, lsl #2]
-+        ldrb        v8, [v6, #10]   @ neigh->pred_flag
-+        ldr         v3, [v3, ip, lsl #2]
-+        ldr         v4, [v4, lr, lsl #2]
-+        teq         a3, #3
-+        beq         20f
-+        teq         v8, #3
-+        beq         90f
++#define vcos_verify_ge0(x) ((x)>=0)
 +
-+        tst         a3, #1
-+        ldrne       a3, [v5, #0]    @ curr->mv[0]
-+        ldreq       a3, [v5, #4]    @ curr->mv[1]
-+        moveq       v1, v2
-+        tst         v8, #1
-+        ldrne       v8, [v6, #0]    @ neigh->mv[0]
-+        ldreq       v8, [v6, #4]    @ neigh->mv[1]
-+        moveq       v3, v4
-+        teq         v1, v3
-+        bne         10f
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v8, a3
-+        ssub16      a3, a3, v8
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        @ drop through
-+10:     movne       a3, #1
-+11:     subs        a2, a2, #1
-+12:     strbhs      a3, [v7], a4
-+        subs        a2, a2, #1
-+        bhs         12b
-+
-+        ldm         sp, {a2, a3}
-+        add         ip, sp, #16*4
-+        subs        a1, a1, #1
-+        add         v5, v5, a3
-+        add         v6, v6, a3
-+        bhi         1b
-+        pop         {a2-a4,v1-v8,pc}
-+
-+20:     teq         v8, #3
-+        bne         10b
-+
-+        teq         v1, v3
-+        teqeq       v2, v4
-+        bne         40f
-+        teq         v1, v2
-+        bne         30f
-+
-+        ldrd        v1, v2, [v5]    @ curr->mv
-+        ldrd        v3, v4, [v6]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v3, v1
-+        ssub16      a3, v1, v3
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        bne         25f
-+        ssub16      ip, v4, v2
-+        ssub16      a3, v2, v4
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        beq         11b
-+        @ drop through
-+25:     ssub16      ip, v4, v1
-+        ssub16      a3, v1, v4
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        bne         10b
-+        ssub16      ip, v3, v2
-+        ssub16      a3, v2, v3
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        b           10b
-+
-+30:     ldrd        v1, v2, [v5]    @ curr->mv
-+        ldrd        v3, v4, [v6]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        ssub16      ip, v3, v1
-+        ssub16      a3, v1, v3
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        bne         10b
-+        ssub16      ip, v4, v2
-+        ssub16      a3, v2, v4
-+        sel         a3, a3, ip
-+        ands        a3, a3, lr
-+        b           10b
-+
-+40:     teq         v1, v4
-+        teqeq       v2, v3
-+        bne         10b
-+
-+        ldrd        v1, v2, [v5]    @ curr->mv
-+        ldrd        v3, v4, [v6]    @ neigh->mv
-+        ldr         lr, =0xFFFCFFFC
-+        b           25b
-+
-+90:     mov         a3, #1
-+        b           11b
-+endfunc
-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-index e5da7e9..49c70dd 100644
---- a/libavcodec/arm/hevcdsp_init_neon.c
-+++ b/libavcodec/arm/hevcdsp_init_neon.c
-@@ -290,6 +290,10 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
- }
- #undef CMP
- 
-+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-+                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-+                                                MvField *curr, MvField *neigh, uint8_t *bs);
-+
- av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
- {
-     if (bit_depth == 8) {
-@@ -387,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
-         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
-     }
-+
-+    assert(offsetof(MvField, mv) == 0);
-+    assert(offsetof(MvField, ref_idx) == 8);
-+    assert(offsetof(MvField, pred_flag) == 10);
-+    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
- }
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 7eb37e6..496c0e1 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -684,17 +684,6 @@ typedef struct CodingUnit {
-     uint8_t cu_transquant_bypass_flag;
- } CodingUnit;
- 
--typedef struct Mv {
--    int16_t x;  ///< horizontal component of motion vector
--    int16_t y;  ///< vertical component of motion vector
--} Mv;
--
--typedef struct MvField {
--    DECLARE_ALIGNED(4, Mv, mv)[2];
--    int8_t ref_idx[2];
--    int8_t pred_flag;
--} MvField;
--
- typedef struct NeighbourAvailable {
-     int cand_bottom_left;
-     int cand_left;
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 6367068..826a82f 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -726,69 +726,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-     }
- }
- 
--static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
--                             RefPicList *neigh_refPicList)
--{
--    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
--        // same L0 and L1
--        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
--            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
--            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
--            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
--                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
--                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
--                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
--                return 1;
--            else
--                return 0;
--        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
--                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
--            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
--                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
--                return 1;
--            else
--                return 0;
--        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
--                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
--            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
--                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
--                return 1;
--            else
--                return 0;
--        } else {
--            return 1;
--        }
--    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
--        Mv A, B;
--        int ref_A, ref_B;
--
--        if (curr->pred_flag & 1) {
--            A     = curr->mv[0];
--            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
--        } else {
--            A     = curr->mv[1];
--            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
--        }
--
--        if (neigh->pred_flag & 1) {
--            B     = neigh->mv[0];
--            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
--        } else {
--            B     = neigh->mv[1];
--            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
--        }
--
--        if (ref_A == ref_B) {
--            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
--                return 1;
--            else
--                return 0;
--        } else
--            return 1;
--    }
--
--    return 1;
--}
- 
- void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-                                            int log2_trafo_size)
-@@ -799,10 +736,17 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
-     int min_pu_width     = s->ps.sps->min_pu_width;
-     int min_tu_width     = s->ps.sps->min_tb_width;
--    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
--                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
-     int boundary_upper, boundary_left;
--    int i, j, bs;
-+    int i, j;
-+    RefPicList *rpl      = s->ref->refPicList;
-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
-+    int y_pu             = y0 >> log2_min_pu_size;
-+    int x_pu             = x0 >> log2_min_pu_size;
-+    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
-+    int is_intra         = curr->pred_flag == PF_INTRA;
-+    int inc              = log2_min_pu_size == 2 ? 2 : 1;
-+    uint8_t *bs;
- 
- #ifdef DISABLE_STRENGTHS
-     return;
-@@ -818,34 +762,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-         boundary_upper = 0;
- 
-+    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
-+
-     if (boundary_upper) {
-         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
-                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
--                              s->ref->refPicList;
--        int yp_pu = (y0 - 1) >> log2_min_pu_size;
--        int yq_pu =  y0      >> log2_min_pu_size;
--        int yp_tu = (y0 - 1) >> log2_min_tu_size;
--        int yq_tu =  y0      >> log2_min_tu_size;
-+                              rpl;
-+        MvField *top = curr - min_pu_width;
-+
-+        if (is_intra) {
-+            for (i = 0; i < (1 << log2_trafo_size); i += 4)
-+                bs[i >> 2] = 2;
-+
-+        } else {
-+            int y_tu = y0 >> log2_min_tu_size;
-+            int x_tu = x0 >> log2_min_tu_size;
-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-+            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
-+
-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-+                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
-+                    curr, top, bs);
- 
-             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
--                int x_pu = (x0 + i) >> log2_min_pu_size;
--                int x_tu = (x0 + i) >> log2_min_tu_size;
--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
--                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
--                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
--
--                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
--                    bs = 2;
--                else if (curr_cbf_luma || top_cbf_luma)
--                    bs = 1;
--                else
--                    bs = boundary_strength(s, curr, top, rpl_top);
--                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-+                int i_pu = i >> log2_min_pu_size;
-+                int i_tu = i >> log2_min_tu_size;
-+
-+                if (top[i_pu].pred_flag == PF_INTRA)
-+                    bs[i >> 2] = 2;
-+                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
-+                    bs[i >> 2] = 1;
-             }
-+        }
-+    }
-+
-+    if (!is_intra) {
-+        for (j = inc; j < trafo_in_min_pus; j += inc) {
-+            MvField *top;
-+
-+            curr += min_pu_width * inc;
-+            top = curr - min_pu_width;
-+            bs += s->bs_width * inc << log2_min_pu_size >> 2;
-+
-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-+                    curr, top, bs);
-+        }
-     }
- 
--    // bs for vertical TU boundaries
-     boundary_left = x0 > 0 && !(x0 & 7);
-     if (boundary_left &&
-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-@@ -856,64 +822,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-         boundary_left = 0;
- 
-+    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
-+    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
-+
-     if (boundary_left) {
-         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
-                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
--                               s->ref->refPicList;
--        int xp_pu = (x0 - 1) >> log2_min_pu_size;
--        int xq_pu =  x0      >> log2_min_pu_size;
--        int xp_tu = (x0 - 1) >> log2_min_tu_size;
--        int xq_tu =  x0      >> log2_min_tu_size;
--
--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
--                int y_pu      = (y0 + i) >> log2_min_pu_size;
--                int y_tu      = (y0 + i) >> log2_min_tu_size;
--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
--                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
--                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
--
--                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
--                    bs = 2;
--                else if (curr_cbf_luma || left_cbf_luma)
--                    bs = 1;
--                else
--                    bs = boundary_strength(s, curr, left, rpl_left);
--                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
--            }
--    }
-+                               rpl;
-+        MvField *left = curr - 1;
- 
--    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
--        RefPicList *rpl = s->ref->refPicList;
-+        if (is_intra) {
-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
-+                bs[j * s->bs_width >> 2] = 2;
- 
--        // bs for TU internal horizontal PU boundaries
--        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
--            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
--            int yq_pu = (y0 + j)     >> log2_min_pu_size;
--
--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
--                int x_pu = (x0 + i) >> log2_min_pu_size;
--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
--
--                bs = boundary_strength(s, curr, top, rpl);
--                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-+        } else {
-+            int y_tu = y0 >> log2_min_tu_size;
-+            int x_tu = x0 >> log2_min_tu_size;
-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-+            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
-+
-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-+                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
-+                    curr, left, bs);
-+
-+            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-+                int j_pu = j >> log2_min_pu_size;
-+                int j_tu = j >> log2_min_tu_size;
-+
-+                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
-+                    bs[j * s->bs_width >> 2] = 2;
-+                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
-+                    bs[j * s->bs_width >> 2] = 1;
-             }
-         }
-+    }
- 
--        // bs for TU internal vertical PU boundaries
--        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
--            int y_pu = (y0 + j) >> log2_min_pu_size;
-+    if (!is_intra) {
-+        for (i = inc; i < trafo_in_min_pus; i += inc) {
-+            MvField *left;
- 
--            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
--                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
--                int xq_pu = (x0 + i)     >> log2_min_pu_size;
--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-+            curr += inc;
-+            left = curr - 1;
-+            bs += inc << log2_min_pu_size >> 2;
- 
--                bs = boundary_strength(s, curr, left, rpl);
--                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
--            }
-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-+                    curr, left, bs);
-         }
-     }
- }
-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
-index 9d773d9..a6534a9 100644
---- a/libavcodec/hevcdsp.c
-+++ b/libavcodec/hevcdsp.c
-@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
- #include "hevcdsp_template.c"
- #undef BIT_DEPTH
- 
-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-+                                               MvField *curr, MvField *neigh, uint8_t *bs)
++/*static const unsigned code[] =
 +{
-+    for (; pus > 0; pus--) {
-+        int strength, out;
-+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
-+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-+        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
-+        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
++  #include "rpi_shader.hex"
++};*/
 +
-+#if 1 // This more directly matches the original implementation
-+        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-+            // same L0 and L1
-+            if (curr_refL0 == neigh_refL0 &&
-+                curr_refL0 == curr_refL1 &&
-+                neigh_refL0 == neigh_refL1) {
-+                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-+                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else if (neigh_refL0 == curr_refL0 &&
-+                       neigh_refL1 == curr_refL1) {
-+                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else if (neigh_refL1 == curr_refL0 &&
-+                       neigh_refL0 == curr_refL1) {
-+                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else {
-+                strength = 1;
-+            }
-+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+            Mv curr_mv0, neigh_mv0;
++// Size in 32bit words
++#define QPU_CODE_SIZE 2048
++#define VPU_CODE_SIZE 2048
 +
-+            if (curr->pred_flag & 1) {
-+                curr_mv0   = curr->mv[0];
-+            } else {
-+                curr_mv0   = curr->mv[1];
-+                curr_refL0 = curr_refL1;
-+            }
++const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
++// Odd rows
++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
++};
 +
-+            if (neigh->pred_flag & 1) {
-+                neigh_mv0   = neigh->mv[0];
-+            } else {
-+                neigh_mv0   = neigh->mv[1];
-+                neigh_refL0 = neigh_refL1;
-+            }
++struct GPU
++{
++  unsigned int qpu_code[QPU_CODE_SIZE];
++  unsigned int vpu_code[VPU_CODE_SIZE];
++  short transMatrix2even[16*16*2];
++  int open_count; // Number of allocated video buffers
++  int      mb; // Mailbox handle
++  int      vc; // Address in GPU memory
++  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
++  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
++};
 +
-+            if (curr_refL0 == neigh_refL0) {
-+                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
-+                    strength = 1;
-+                else
-+                    strength = 0;
-+            } else
-+                strength = 1;
-+        } else
-+            strength = 1;
-+#else // This has exactly the same effect, but is more suitable for vectorisation
-+        Mv curr_mv[2];
-+        Mv neigh_mv[2];
-+        memcpy(curr_mv, curr->mv, sizeof curr_mv);
-+        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static volatile struct GPU* gpu = NULL;
++static GPU_MEM_PTR_T gpu_mem_ptr;
 +
-+        if (!(curr->pred_flag & 2)) {
-+            curr_mv[1] = curr_mv[0];
-+            curr_refL1 = curr_refL0;
-+        }
-+        if (!(neigh->pred_flag & 2)) {
-+            neigh_mv[1] = neigh_mv[0];
-+            neigh_refL1 = neigh_refL0;
-+        }
-+        if (!(curr->pred_flag & 1)) {
-+            curr_mv[0] = curr_mv[1];
-+            curr_refL0 = curr_refL1;
-+        }
-+        if (!(neigh->pred_flag & 1)) {
-+            neigh_mv[0] = neigh_mv[1];
-+            neigh_refL0 = neigh_refL1;
-+        }
-+
-+        strength = 1;
-+
-+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-+                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
-+                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
-+
-+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-+                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
-+                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
-+
-+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
++static unsigned int Microseconds(void) {
++    struct timespec ts;
++    unsigned int x;
++    static unsigned int base = 0;
++    clock_gettime(CLOCK_REALTIME, &ts);
++    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
++    if (base==0) base=x;
++    return x-base;
++}
 +#endif
 +
-+        curr += in_inc / sizeof (MvField);
-+        neigh += in_inc / sizeof (MvField);
++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
++static void gpu_free_internal(GPU_MEM_PTR_T *p);
 +
-+        for (out = dup; out > 0; out--)
-+        {
-+            *bs = strength;
-+            bs += out_inc;
-+        }
-+    }
-+}
-+
- void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
- {
- #undef FUNC
-@@ -257,6 +371,8 @@ int i = 0;
-         break;
-     }
- 
-+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-+
-     if (ARCH_X86)
-         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
-     if (ARCH_ARM)
-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
-index 9f1f6dd..e221e54 100644
---- a/libavcodec/hevcdsp.h
-+++ b/libavcodec/hevcdsp.h
-@@ -42,6 +42,17 @@ typedef struct SAOParams {
-     uint8_t type_idx[3];    ///< sao_type_idx
- } SAOParams;
- 
-+typedef struct Mv {
-+    int16_t x;  ///< horizontal component of motion vector
-+    int16_t y;  ///< vertical component of motion vector
-+} Mv;
-+
-+typedef struct MvField {
-+    DECLARE_ALIGNED(4, Mv, mv)[2];
-+    int8_t ref_idx[2];
-+    int8_t pred_flag;
-+} MvField;
-+
- typedef struct HEVCDSPContext {
-     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-                     struct GetBitContext *gb, int pcm_bit_depth);
-@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
-     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-                                         int32_t *tc, uint8_t *no_p,
-                                         uint8_t *no_q);
-+    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-+                                               MvField *curr, MvField *neigh, uint8_t *bs);
- } HEVCDSPContext;
- 
- void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
--- 
-2.7.4
-
-
-From 619366d6acfd5f040a3116fda97b1146c8e40250 Mon Sep 17 00:00:00 2001
-From: Peter de Rivaz <peter.derivaz@gmail.com>
-Date: Wed, 15 Jul 2015 09:09:11 +0100
-Subject: [PATCH 68/68] Only enable qpu when needed
-
----
- libavcodec/hevc.h    |  2 +-
- libavcodec/rpi_qpu.c | 21 ++++++++++++++++-----
- 2 files changed, 17 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-index 496c0e1..ce14975 100644
---- a/libavcodec/hevc.h
-+++ b/libavcodec/hevc.h
-@@ -57,7 +57,7 @@
-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-   #define RPI_WORKER
-   // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
--  #define RPI_DEBLOCK_VPU
-+  //#define RPI_DEBLOCK_VPU
- 
- #endif
- 
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index 5aa0432..ffd13ca 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -9,7 +9,7 @@
- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
- #define RPI_ASYNC
- // Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
--#define RPI_COMBINE_JOBS
-+//#define RPI_COMBINE_JOBS
- 
- #include <stdio.h>
- #include <stdlib.h>
-@@ -143,9 +143,9 @@ static int gpu_init(volatile struct GPU **gpu) {
-   volatile struct GPU* ptr;
- 	if (mb < 0)
- 		return -1;
--
++// Connect to QPU, returns 0 on success.
++static int gpu_init(volatile struct GPU **gpu) {
++  int mb = mbox_open();
++  int vc;
++  volatile struct GPU* ptr;
++	if (mb < 0)
++		return -1;
 +#ifndef RPI_ASYNC
- 	if (qpu_enable(mb, 1)) return -2;
--
++	if (qpu_enable(mb, 1)) return -2;
 +#endif
-   vcsm_init();
-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-@@ -336,9 +336,9 @@ static void gpu_term(void)
-     vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-     pthread_join(vpu_thread, &res);
-   }
--#endif
--
-+#else
-   qpu_enable(mb, 0);
-+#endif
-   gpu_free_internal(&gpu_mem_ptr);
- 
-   vcsm_exit();
-@@ -400,6 +400,7 @@ static void *vpu_start(void *arg) {
-   int count_deblock=0;
-   int count_qpu=0;
- #endif
-+  int qpu_started = 0;
-   while(1) {
-     int i;
-     int *p; // Pointer for a QPU/VPU job
-@@ -427,6 +428,12 @@ static void *vpu_start(void *arg) {
-     if (p[7] == 0 && p[0] == 0 && p[16]==0)
-       goto job_done_early;
- 
-+    if (!qpu_started) {
-+      int result = qpu_enable(gpu->mb, 1);
-+      av_assert0(result==0);
-+      qpu_started = 1;
-+    }
++  vcsm_init();
++  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
++  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
++  memset((void*)ptr, 0, sizeof *ptr);
++  vc = gpu_mem_ptr.vc;
 +
- #ifdef RPI_COMBINE_JOBS
-     // First scan for a qpu job
-     for (int x=0;x<num_jobs;x++) {
-@@ -556,6 +563,10 @@ job_done_early:
-     pthread_mutex_unlock(&post_mutex);
-   }
- 
-+  if (qpu_started) {
-+    qpu_enable(gpu->mb, 0);
++  ptr->mb = mb;
++  ptr->vc = vc;
++
++  printf("GPU allocated at 0x%x\n",vc);
++
++  *gpu = ptr;
++
++  // Now copy over the QPU code into GPU memory
++  {
++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
 +  }
++  // And the VPU code
++  {
++    int num_bytes = sizeof(rpi_hevc_transform);
++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
++  }
++  // And the transform coefficients
++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
 +
-   return NULL;
- }
- 
--- 
-2.7.4
-
-From a0d0946951b53e64ce103dd61b455f8d1f72caf9 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 9 Feb 2016 11:57:40 +0000
-Subject: [PATCH 1/2] Zero copy code v6
-
-This version has GPU buffer pooling code
----
- ffmpeg.c                 | 123 +++++++++-----
- libavcodec/Makefile      |   2 +
- libavcodec/avcodec.h     |   6 +
- libavcodec/hevc.c        |  92 ++++++-----
- libavcodec/hevc_filter.c |  83 +++++-----
- libavcodec/rpi_qpu.c     |   2 +-
- libavcodec/rpi_qpu.h     | 109 ++++++++++++-
- libavcodec/rpi_zc.c      | 406 +++++++++++++++++++++++++++++++++++++++++++++++
- libavcodec/rpi_zc.h      |  83 ++++++++++
- 9 files changed, 779 insertions(+), 127 deletions(-)
- create mode 100644 libavcodec/rpi_zc.c
- create mode 100644 libavcodec/rpi_zc.h
-
-diff --git a/ffmpeg.c b/ffmpeg.c
-index 50c6e86..953e5b8 100644
---- a/ffmpeg.c
-+++ b/ffmpeg.c
-@@ -25,7 +25,7 @@
- 
- #ifdef RPI
- #define RPI_DISPLAY
--//#define RPI_ZERO_COPY
-+#define RPI_ZERO_COPY
- #endif
- 
- #include "config.h"
-@@ -80,9 +80,7 @@
- #include <interface/mmal/util/mmal_default_components.h>
- #include <interface/mmal/util/mmal_connection.h>
- #include <interface/mmal/util/mmal_util_params.h>
--#ifdef RPI_ZERO_COPY
--#include "libavcodec/rpi_qpu.h"
--#endif
-+#include "libavcodec/rpi_zc.h"
- #endif
- 
- #if HAVE_SYS_RESOURCE_H
-@@ -183,13 +181,7 @@ static void free_input_threads(void);
- 
- static MMAL_COMPONENT_T* rpi_display = NULL;
- static MMAL_POOL_T *rpi_pool = NULL;
--
--#ifdef RPI_ZERO_COPY
--static uint8_t *get_vc_handle(AVBufferRef *bref) {
--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
--  return (uint8_t *)p->vc_handle;
--}
--#endif
-+static volatile int rpi_display_count = 0;
- 
- static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
- {
-@@ -206,7 +198,7 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
-     for (i = 0; i < NUM_BUFFERS; ++i)
-     {
-        MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
--       void* bufPtr = buffer->data;
-+       char * bufPtr = buffer->data;
-        memset(bufPtr, i*30, w*h);
-        memset(bufPtr+w*h, 128, (w*h)/2);
-     }
-@@ -215,23 +207,31 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
-     return pool;
- }
- 
--static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+#ifdef RPI_ZERO_COPY
-+    av_rpi_zc_unref(buffer->user_data);
-+    --rpi_display_count;
-+#endif
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-   mmal_buffer_header_release(buffer);
- }
- 
- static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
- {
-     MMAL_COMPONENT_T* display;
--    int w2 = (w+31)&~31;
--    int h2 = (h+15)&~15;
-     MMAL_DISPLAYREGION_T region =
-     {
--        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-+        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-         .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
-         .layer = 2,
-         .fullscreen = 0,
-         .dest_rect = {x, y, w, h}
-     };
-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
-+
-     bcm_host_init();  // TODO is this needed?
-     mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
-     assert(display);
-@@ -240,8 +240,8 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
- 
-     MMAL_ES_FORMAT_T* format = display->input[0]->format;
-     format->encoding = MMAL_ENCODING_I420;
--    format->es->video.width = w2;
--    format->es->video.height = h2;
-+    format->es->video.width = geo.stride_y;
-+    format->es->video.height = geo.height_y;
-     format->es->video.crop.x = 0;
-     format->es->video.crop.y = 0;
-     format->es->video.crop.width = w;
-@@ -250,46 +250,75 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
- 
-     mmal_component_enable(display);
- 
--    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
-+    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
- 
-     mmal_port_enable(display->input[0],display_cb_input);
--    mmal_port_enable(display->control,display_cb_input);
-+    mmal_port_enable(display->control,display_cb_control);
- 
--    printf("Allocated display %d %d\n",w,h);
-+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
- 
-     return display;
- }
- 
--static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
-+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
- {
--    int w = fr->width;
--    int h = fr->height;
--    int w2 = (w+31)&~31;
--    int h2 = (h+15)&~15;
-     if (!display || !rpi_pool)
-         return;
-+
-+    if (rpi_display_count >= 3) {
-+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
-+        return;
++#ifdef RPI_ASYNC
++  {
++    int err;
++    vpu_async_tail = 0;
++    vpu_async_head = 0;
++    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
++    //printf("Created thread\n");
++    if (err) {
++        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
++        return -4;
 +    }
 +
-     MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
-     if (!buf) {
--      // Running too fast so drop the frame
--      return;
-+        // Running too fast so drop the frame
-+        printf("Q alloc failure\n");
-+        return;
-     }
-     assert(buf);
-     buf->cmd = 0;
--    buf->length = (w2 * h2 * 3)/2;
-     buf->offset = 0; // Offset to valid data
-     buf->flags = 0;
- #ifdef RPI_ZERO_COPY
--    buf->data = get_vc_handle(fr->buf[0]);
--    buf->alloc_size = (w2*h2*3)/2;
-+{
-+    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
-+
-+    buf->user_data = fr_buf;
-+    buf->data = av_rpi_zc_vc_handle(fr_buf);
-+    buf->alloc_size =
-+        buf->length = av_rpi_zc_numbytes(fr_buf);
-+
-+    ++rpi_display_count;
-+}
- #else
-+{
-+#error YYY
-+    int w = fr->width;
-+    int h = fr->height;
-+    int w2 = (w+31)&~31;
-+    int h2 = (h+15)&~15;
-+
-+    buf->length = (w2 * h2 * 3)/2;
-+    buf->user_data = NULL;
-+
-     //mmal_buffer_header_mem_lock(buf);
-     memcpy(buf->data, fr->data[0], w2 * h);
-     memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
-     memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
-     //mmal_buffer_header_mem_unlock(buf);
-+}
- #endif
- 
--    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
-+    while (rpi_display_count >= 3) {
-+        usleep(5000);
-+    }
-+
-+    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
 +    {
-+        printf("** send failed: depth=%d\n", rpi_display_count);
-+        display_cb_input(NULL, buf);
++      struct sched_param param = {0};
++      int policy = 0;
++
++      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
++      {
++        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
++      }
++      else
++      {
++        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
++            policy,
++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
++            param.sched_priority);
++
++        policy = SCHED_FIFO;
++        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
++
++        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
++            policy,
++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
++            param.sched_priority);
++
++        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
++        {
++          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
++        }
++        else
++        {
++          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
++          {
++            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
++          }
++          else
++          {
++            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
++                policy,
++                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
++                param.sched_priority);
++          }
++        }
++      }
++
 +    }
- }
- 
- static void display_exit(MMAL_COMPONENT_T* display)
-@@ -687,6 +716,11 @@ static void ffmpeg_cleanup(int ret)
-         avformat_close_input(&input_files[i]->ctx);
-         av_freep(&input_files[i]);
-     }
 +
-+#ifdef RPI_DISPLAY
-+    display_exit(rpi_display);
++  }
 +#endif
 +
-     for (i = 0; i < nb_input_streams; i++) {
-         InputStream *ist = input_streams[i];
- 
-@@ -698,6 +732,9 @@ static void ffmpeg_cleanup(int ret)
-         av_freep(&ist->filters);
-         av_freep(&ist->hwaccel_device);
- 
-+#ifdef RPI_ZERO_COPY
-+        av_rpi_zc_uninit(ist->dec_ctx);
++  return 0;
++}
++
++// Returns 1 if the gpu is currently idle
++static int gpu_idle(void)
++{
++  int ret = pthread_mutex_trylock(&gpu_mutex);
++  if (ret==0) {
++    pthread_mutex_unlock(&gpu_mutex);
++    return 1;
++  }
++  return 0;
++}
++
++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
++static void gpu_lock(void) {
++  pthread_mutex_lock(&gpu_mutex);
++
++  if (gpu==NULL) {
++    gpu_init(&gpu);
++  }
++}
++
++static void gpu_unlock(void) {
++  pthread_mutex_unlock(&gpu_mutex);
++}
++
++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
++  p->numbytes = numbytes;
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mem_lock(mb, p->vc_handle);
++  av_assert0(p->vc);
++  return 0;
++}
++
++// Allocate memory on GPU
++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
++// Returns 0 on success.
++// This allocates memory that will not be cached in ARM's data cache.
++// Therefore safe to use without data cache flushing.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++  int r;
++  gpu_lock();
++  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
++  gpu->open_count++;
++  gpu_unlock();
++  return r;
++}
++
++int gpu_get_mailbox(void)
++{
++  av_assert0(gpu);
++  return gpu->mb;
++}
++
++// Call this to clean and invalidate a region of memory
++void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
++{
++#ifdef RPI_FAST_CACHEFLUSH
++    struct vcsm_user_clean_invalid_s iocache = {};
++    iocache.s[0].handle = p->vcsm_handle;
++    iocache.s[0].cmd = 3; // clean+invalidate
++    iocache.s[0].addr = (int) p->arm;
++    iocache.s[0].size  = p->numbytes;
++    vcsm_clean_invalid( &iocache );
++#else
++    void *tmp = vcsm_lock(p->vcsm_handle);
++    vcsm_unlock_ptr(tmp);
 +#endif
-         avcodec_free_context(&ist->dec_ctx);
- 
-         av_freep(&input_streams[i]);
-@@ -729,9 +766,6 @@ static void ffmpeg_cleanup(int ret)
-     term_exit();
-     ffmpeg_exited = 1;
- 
--#ifdef RPI_DISPLAY
--    display_exit(rpi_display);
--#endif
- }
- 
- void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -1091,18 +1125,19 @@ static void do_video_out(AVFormatContext *s,
-     int frame_size = 0;
-     InputStream *ist = NULL;
-     AVFilterContext *filter = ost->filter->filter;
++}
 +
-+    if (ost->source_index >= 0)
-+        ist = input_streams[ost->source_index];
++void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
++{
++#ifdef RPI_FAST_CACHEFLUSH
++    struct vcsm_user_clean_invalid_s iocache = {};
++    iocache.s[0].handle = p0->vcsm_handle;
++    iocache.s[0].cmd = 3; // clean+invalidate
++    iocache.s[0].addr = (int) p0->arm;
++    iocache.s[0].size  = p0->numbytes;
++    iocache.s[1].handle = p1->vcsm_handle;
++    iocache.s[1].cmd = 3; // clean+invalidate
++    iocache.s[1].addr = (int) p1->arm;
++    iocache.s[1].size  = p1->numbytes;
++    iocache.s[2].handle = p2->vcsm_handle;
++    iocache.s[2].cmd = 3; // clean+invalidate
++    iocache.s[2].addr = (int) p2->arm;
++    iocache.s[2].size  = p2->numbytes;
++    vcsm_clean_invalid( &iocache );
++#else
++    void *tmp;
++    tmp = vcsm_lock(p0->vcsm_handle);
++    vcsm_unlock_ptr(tmp);
++    tmp = vcsm_lock(p1->vcsm_handle);
++    vcsm_unlock_ptr(tmp);
++    tmp = vcsm_lock(p2->vcsm_handle);
++    vcsm_unlock_ptr(tmp);
++#endif
++}
 +
- #ifdef RPI_DISPLAY
--    if (next_picture)
-+    if (next_picture && ist != NULL)
-     {
--	if (!rpi_display)
-+        if (!rpi_display)
-            rpi_display = display_init(0,0,next_picture->width,next_picture->height);
--        display_frame(rpi_display,next_picture);
-+        display_frame(ist->dec_ctx, rpi_display, next_picture);
-     }
- #endif
- 
--    if (ost->source_index >= 0)
--        ist = input_streams[ost->source_index];
--
-     if (filter->inputs[0]->frame_rate.num > 0 &&
-         filter->inputs[0]->frame_rate.den > 0)
-         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
-@@ -2708,6 +2743,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
-         ist->dec_ctx->opaque                = ist;
-         ist->dec_ctx->get_format            = get_format;
-         ist->dec_ctx->get_buffer2           = get_buffer;
++static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
++  p->numbytes = numbytes;
++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++  av_assert0(p->vcsm_handle);
++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++  av_assert0(p->vc_handle);
++  p->arm = vcsm_lock(p->vcsm_handle);
++  av_assert0(p->arm);
++  p->vc = mem_lock(gpu->mb, p->vc_handle);
++  av_assert0(p->vc);
++  return 0;
++}
 +
-+#ifdef RPI_ZERO_COPY
-+        // Overrides the above get_buffer2
-+        av_rpi_zc_init(ist->dec_ctx);
++// This allocates data that will be
++//    Cached in ARM L2
++//    Uncached in VPU L2
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{
++  int r;
++  gpu_lock();
++  r = gpu_malloc_cached_internal(numbytes, p);
++  gpu->open_count++;
++  gpu_unlock();
++  return r;
++}
++
++static void gpu_term(void)
++{
++  int mb;
++
++  if (gpu==NULL)
++    return;
++  mb = gpu->mb;
++
++  // ??? Tear down anything needed for gpuexecute
++
++  qpu_enable(mb, 0);
++  gpu_free_internal(&gpu_mem_ptr);
++
++  vcsm_exit();
++
++  mbox_close(mb);
++  gpu = NULL;
++}
++
++void gpu_free_internal(GPU_MEM_PTR_T *p) {
++  int mb = gpu->mb;
++  mem_unlock(mb,p->vc_handle);
++  vcsm_unlock_ptr(p->arm);
++  vcsm_free(p->vcsm_handle);
++}
++
++void gpu_free(GPU_MEM_PTR_T *p) {
++  gpu_lock();
++
++  gpu_free_internal(p);
++
++  gpu->open_count--;
++  if (gpu->open_count==0) {
++      printf("Closing GPU\n");
++      gpu_term();
++      gpu = NULL;
++  }
++  gpu_unlock();
++}
++
++unsigned int vpu_get_fn(void) {
++  // Make sure that the gpu is initialized
++  if (gpu==NULL) {
++    printf("Preparing gpu\n");
++    gpu_lock();
++    gpu_unlock();
++  }
++  return gpu->vc + offsetof(struct GPU,vpu_code);
++}
++
++unsigned int vpu_get_constants(void) {
++  if (gpu==NULL) {
++    gpu_lock();
++    gpu_unlock();
++  }
++  return gpu->vc + offsetof(struct GPU,transMatrix2even);
++}
++
++#ifdef GPUSERVICE
++static void callback(void *cookie)
++{
++  sem_post((sem_t *)cookie);
++}
 +#endif
 +
-         ist->dec_ctx->thread_safe_callbacks = 1;
- 
-         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 03065cd..21e4514 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -9,6 +9,7 @@ HEADERS = avcodec.h                                                     \
-           rpi_shader.h                                                  \
-           rpi_mailbox.h                                                 \
-           rpi_hevc_transform.h                                          \
-+          rpi_zc.h                                                      \
-           d3d11va.h                                                     \
-           dirac.h                                                       \
-           dv_profile.h                                                  \
-@@ -50,6 +51,7 @@ OBJS = allcodecs.o                                                      \
-        rpi_qpu.o                                                        \
-        rpi_shader.o                                                     \
-        rpi_mailbox.o                                                    \
-+       rpi_zc.o                                                         \
-        vorbis_parser.o                                                  \
-        xiph.o                                                           \
- 
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index 39713ed..a1ba217 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -3505,6 +3505,12 @@ typedef struct AVCodecContext {
- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
- #endif
- 
-+    /**
-+     * Opaque pointer for use by replacement get_buffer2 code
-+     *
-+     * @author jc (08/02/2016)
-+     */
-+    void * get_buffer_context;
- } AVCodecContext;
- 
- AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-index 8437e10..51736c7 100644
---- a/libavcodec/hevc.c
-+++ b/libavcodec/hevc.c
-@@ -114,10 +114,6 @@ static uint32_t rpi_filter_coefs[8][1] = {
-         { ENCODE_COEFFS(  -2,  10,  58,  -2) }
- };
- 
--static uint32_t get_vc_address(AVBufferRef *bref) {
--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
--  return p->vc;
--}
- #endif
- 
- 
-@@ -2197,9 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   int bw = nPbW-start_x;
-                   int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   if (weight_flag) {
-@@ -2207,7 +2203,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   } else {
-                       *y++ = 1; // Weight of 1 and offset of 0
-                   }
--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
-@@ -2246,8 +2242,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-@@ -2258,8 +2254,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                           *u++ = 1; // Weight of 1 and offset of 0
-                           *u++ = 1;
-                       }
--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
-                 s->curr_u_mvs = u;
-@@ -2297,9 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   int bw = nPbW-start_x;
-                   int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   if (weight_flag) {
-@@ -2307,7 +2303,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   } else {
-                       *y++ = 1; // Weight of 1 and offset of 0
-                   }
--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-                 }
-             }
-@@ -2347,8 +2343,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-                       *u++ = rpi_filter_coefs[_mx][0];
-@@ -2360,8 +2356,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                           *u++ = 1; // Weight of 1 and offset of 0
-                           *u++ = 1;
-                       }
--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
-                 s->curr_u_mvs = u;
-@@ -2403,13 +2399,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                   int bw = nPbW-start_x;
-                   int bh = nPbH-start_y;
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
-                   *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
-                   *y++ = my2_mx2_my_mx;
-                   *y++ = 1; // B frame weighted prediction not supported
--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-                 }
-             }
-@@ -2453,8 +2449,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx][0];
-                       *u++ = rpi_filter_coefs[_my][0];
-@@ -2464,14 +2460,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-                       *u++ = rpi_filter_coefs[_mx2][0];
-                       *u++ = rpi_filter_coefs[_my2][0];
-                       u+=2; // Weights not supported in B slices
--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-                     }
-                 }
-                 s->curr_u_mvs = u;
-@@ -3270,12 +3266,13 @@ static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,
-    return vsum;
- }
- 
--static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
- {
-   //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
-   int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
-   int pitch = frame->linesize[cIdx];
--  uint32_t base = get_vc_address(frame->buf[cIdx]);
-+  uint32_t base = c_idx == 0 ? get_vc_address_y(frame);
-+    c_idx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
-   if (p>=base && p<base+pitch*pic_height) {
-     return frame->data[cIdx] + (p-base);
-   }
-@@ -3562,6 +3559,7 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
- #ifdef RPI
- 
- #ifndef RPI_FAST_CACHEFLUSH
-+#error RPI_FAST_CACHEFLUSH is broken
- static void flush_buffer(AVBufferRef *bref) {
-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-     gpu_cache_flush(p);
-@@ -3572,7 +3570,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
- {
- #ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-     int n = s->ps.sps->height;
-     int curr_y = 0;
-     int curr_uv = 0;
-@@ -3580,21 +3578,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-     int sz,base;
-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-     base = s->frame->linesize[1] * curr_uv;
--    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].handle = p.vcsm_handle;
-     iocache.s[0].cmd = 3; // clean+invalidate
--    iocache.s[0].addr = (int)(p->arm) + base;
-+    iocache.s[0].addr = (int)(p.arm) + base;
-     iocache.s[0].size  = sz;
--    p = av_buffer_pool_opaque(frame->buf[2]);
--    iocache.s[1].handle = p->vcsm_handle;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-     iocache.s[1].cmd = 3; // clean+invalidate
--    iocache.s[1].addr = (int)(p->arm) + base;
-+    iocache.s[1].addr = (int)(p.arm) + base;
-     iocache.s[1].size  = sz;
--    p = av_buffer_pool_opaque(frame->buf[0]);
-+    p = get_gpu_mem_ptr_y(s->frame);
-     sz = s->frame->linesize[0] * (n-curr_y);
-     base = s->frame->linesize[0] * curr_y;
--    iocache.s[2].handle = p->vcsm_handle;
-+    iocache.s[2].handle = p.vcsm_handle;
-     iocache.s[2].cmd = 3; // clean+invalidate
--    iocache.s[2].addr = (int)(p->arm) + base;
-+    iocache.s[2].addr = (int)(p.arm) + base;
-     iocache.s[2].size  = sz;
-     vcsm_clean_invalid( &iocache );
- #else
-@@ -3612,7 +3610,7 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
-     int curr_y;
-     int curr_uv;
-     int n_uv;
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
-     int sz,base;
-     int (*d)[2] = s->dblk_cmds[job];
-     int low=(*d)[1];
-@@ -3629,21 +3627,21 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
- 
-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-     base = s->frame->linesize[1] * curr_uv;
--    iocache.s[0].handle = p->vcsm_handle;
-+    iocache.s[0].handle = p.vcsm_handle;
-     iocache.s[0].cmd = 3; // clean+invalidate
--    iocache.s[0].addr = (int)(p->arm) + base;
-+    iocache.s[0].addr = (int)(p.arm) + base;
-     iocache.s[0].size  = sz;
--    p = av_buffer_pool_opaque(frame->buf[2]);
--    iocache.s[1].handle = p->vcsm_handle;
-+    p = get_gpu_mem_ptr_v(s->frame);
-+    iocache.s[1].handle = p.vcsm_handle;
-     iocache.s[1].cmd = 3; // clean+invalidate
--    iocache.s[1].addr = (int)(p->arm) + base;
-+    iocache.s[1].addr = (int)(p.arm) + base;
-     iocache.s[1].size  = sz;
--    p = av_buffer_pool_opaque(frame->buf[0]);
-+    p = get_gpu_mem_ptr_y(s->frame);
-     sz = s->frame->linesize[0] * (n-curr_y);
-     base = s->frame->linesize[0] * curr_y;
--    iocache.s[2].handle = p->vcsm_handle;
-+    iocache.s[2].handle = p.vcsm_handle;
-     iocache.s[2].cmd = 3; // clean+invalidate
--    iocache.s[2].addr = (int)(p->arm) + base;
-+    iocache.s[2].addr = (int)(p.arm) + base;
-     iocache.s[2].size  = sz;
- 
-     iocache.s[3].handle = p0->vcsm_handle;
-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-index 826a82f..c4fa305 100644
---- a/libavcodec/hevc_filter.c
-+++ b/libavcodec/hevc_filter.c
-@@ -879,17 +879,25 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
- #undef CR
- 
- #ifdef RPI_INTER_QPU
--static void flush_buffer(AVBufferRef *bref) {
--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
--    gpu_cache_flush(p);
-+static void flush_buffer_y(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
-+    gpu_cache_flush(&p);
- }
- 
--// Return Physical address for this image
--static uint32_t get_vc_address(AVBufferRef *bref) {
--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
--  return p->vc;
-+static void flush_buffer_u(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
-+    gpu_cache_flush(&p);
- }
- 
-+static void flush_buffer_v(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
-+    gpu_cache_flush(&p);
++
++static volatile uint32_t post_done = 0;
++static volatile uint32_t post_qed = 0;
++
++static void post_code2_cb(void * v)
++{
++  uint32_t n = (uint32_t)v;
++  if ((int32_t)(n - post_done) > 0) {
++    post_done = n;
++  }
 +}
 +
 +
-+#ifdef RPI_DEBLOCK_VPU
-+#error Not fixed yet
++// Post a command to the queue
++// Returns an id which we can use to wait for completion
++int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
++{
++  struct gpu_job_s j[1] = {
++    {
++      .command = EXECUTE_VPU,
++      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
++      .callback.func = post_code2_cb
++    }
++  };
++  uint32_t id;
 +
- // ff_hevc_flush_buffer_lines
- // flushes and invalidates all pixel rows in [start,end-1]
- static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
-@@ -901,44 +909,44 @@ static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int f
-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
-         int n_uv = n >> s->ps.sps->vshift[1];
-         int sz,base;
--        GPU_MEM_PTR_T *p;
-+        GPU_MEM_PTR_T p;
-         if (curr_uv < 0) curr_uv = 0;
-         if (n_uv<=curr_uv) { return; }
-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-         base = s->frame->linesize[1] * curr_uv;
-         if (flush_chroma) {
--          p = av_buffer_pool_opaque(s->frame->buf[1]);
--          iocache.s[0].handle = p->vcsm_handle;
-+          p = get_gpu_mem_ptr_u(s->frame);
-+          iocache.s[0].handle = p.vcsm_handle;
-           iocache.s[0].cmd = 3; // clean+invalidate
--          iocache.s[0].addr = (int)p->arm + base;
-+          iocache.s[0].addr = (int)p.arm + base;
-           iocache.s[0].size  = sz;
--          p = av_buffer_pool_opaque(s->frame->buf[2]);
--          iocache.s[1].handle = p->vcsm_handle;
-+          p = get_gpu_mem_ptr_v(s->frame);
-+          iocache.s[1].handle = p.vcsm_handle;
-           iocache.s[1].cmd = 3; // clean+invalidate
--          iocache.s[1].addr = (int)p->arm + base;
-+          iocache.s[1].addr = (int)p.arm + base;
-           iocache.s[1].size  = sz;
-         }
-         if (flush_luma) {
--          p = av_buffer_pool_opaque(s->frame->buf[0]);
-+          p = get_gpu_mem_ptr_y(s->frame);
-           sz = s->frame->linesize[0] * (n-curr_y);
-           base = s->frame->linesize[0] * curr_y;
--          iocache.s[2].handle = p->vcsm_handle;
-+          iocache.s[2].handle = p.vcsm_handle;
-           iocache.s[2].cmd = 3; // clean+invalidate
--          iocache.s[2].addr = (int)p->arm + base;
-+          iocache.s[2].addr = (int)p.arm + base;
-           iocache.s[2].size  = sz;
-         }
-         vcsm_clean_invalid( &iocache );
- #else
-         if (flush_chroma) {
--          flush_buffer(s->frame->buf[1]);
--          flush_buffer(s->frame->buf[2]);
-+          flush_buffer_u(s->frame);
-+          flush_buffer_v(s->frame);
-         }
-         if (flush_luma) {
--          flush_buffer(s->frame->buf[0]);
-+          flush_buffer_y(s->frame);
-         }
- #endif
- }
--
++  j[0].callback.cookie = (void *)(id = ++post_qed);
++
++  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
++
++  return id;
++}
++
++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
++    int qpu0_n, const uint32_t * qpu0_mail,
++    int qpu1_n, const uint32_t * qpu1_mail)
++{
++#if 1
++  sem_t sync0;
++  struct gpu_job_s j[4];
++
++  sem_init(&sync0, 0, 0);
++
++  j[0].command = EXECUTE_VPU;
++  j[0].u.v.q[0] = vpu_code;
++  j[0].u.v.q[1] = r0;
++  j[0].u.v.q[2] = r1;
++  j[0].u.v.q[3] = r2;
++  j[0].u.v.q[4] = r3;
++  j[0].u.v.q[5] = r4;
++  j[0].u.v.q[6] = r5;
++  j[0].callback.func = 0;
++  j[0].callback.cookie = NULL;
++
++  j[1].command = EXECUTE_QPU;
++  j[1].u.q.jobs = qpu1_n;
++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
++  j[1].u.q.timeout = 5000;
++  j[1].callback.func = 0;
++  j[1].callback.cookie = NULL;
++
++  j[2].command = EXECUTE_QPU;
++  j[2].u.q.jobs = qpu0_n;
++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  j[2].u.q.noflush = 1;
++  j[2].u.q.timeout = 5000;
++  j[2].callback.func = 0;
++  j[2].callback.cookie = NULL;
++
++  j[3].command = EXECUTE_SYNC;
++  j[3].u.s.mask = 3;
++  j[3].callback.func = callback;
++  j[3].callback.cookie = (void *)&sync0;
++
++  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
++
++  sem_wait(&sync0);
++#else
++
++  sem_t sync0, sync2;
++  struct gpu_job_s j[3];
++
++  sem_init(&sync0, 0, 0);
++  sem_init(&sync2, 0, 0);
++
++  j[0].command = EXECUTE_VPU;
++  j[0].u.v.q[0] = vpu_code;
++  j[0].u.v.q[1] = r0;
++  j[0].u.v.q[2] = r1;
++  j[0].u.v.q[3] = r2;
++  j[0].u.v.q[4] = r3;
++  j[0].u.v.q[5] = r4;
++  j[0].u.v.q[6] = r5;
++  j[0].callback.func = callback;
++  j[0].callback.cookie = (void *)&sync0;
++
++  j[1].command = EXECUTE_QPU;
++  j[1].u.q.jobs = qpu1_n;
++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
++  j[1].u.q.timeout = 5000;
++  j[1].callback.func = 0;
++  j[1].callback.cookie = NULL;
++
++  j[2].command = EXECUTE_QPU;
++  j[2].u.q.jobs = qpu0_n;
++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  j[2].u.q.noflush = 1;
++  j[2].u.q.timeout = 5000;
++  j[2].callback.func = callback;
++  j[2].callback.cookie = (void *)&sync2;
++
++  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
++
++  sem_wait(&sync0);
++  sem_wait(&sync2);
 +#endif
- 
- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- {
-@@ -950,37 +958,37 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
-         int n_uv = n >> s->ps.sps->vshift[1];
-         int sz,base;
--        GPU_MEM_PTR_T *p;
-+        GPU_MEM_PTR_T p;
-         if (curr_uv < 0) curr_uv = 0;
-         if (n_uv<=curr_uv) { return; }
-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-         base = s->frame->linesize[1] * curr_uv;
--        p = av_buffer_pool_opaque(s->frame->buf[1]);
--        iocache.s[0].handle = p->vcsm_handle;
-+        p = get_gpu_mem_ptr_u(s->frame);
-+        iocache.s[0].handle = p.vcsm_handle;
-         iocache.s[0].cmd = 3; // clean+invalidate
--        iocache.s[0].addr = (int)p->arm + base;
-+        iocache.s[0].addr = (int)p.arm + base;
-         iocache.s[0].size  = sz;
--        p = av_buffer_pool_opaque(s->frame->buf[2]);
--        iocache.s[1].handle = p->vcsm_handle;
-+        p = get_gpu_mem_ptr_v(s->frame);
-+        iocache.s[1].handle = p.vcsm_handle;
-         iocache.s[1].cmd = 3; // clean+invalidate
--        iocache.s[1].addr = (int)p->arm + base;
-+        iocache.s[1].addr = (int)p.arm + base;
-         iocache.s[1].size  = sz;
- 
- #ifdef RPI_LUMA_QPU
--        p = av_buffer_pool_opaque(s->frame->buf[0]);
-+        p = get_gpu_mem_ptr_y(s->frame);
-         sz = s->frame->linesize[0] * (n-curr_y);
-         base = s->frame->linesize[0] * curr_y;
--        iocache.s[2].handle = p->vcsm_handle;
-+        iocache.s[2].handle = p.vcsm_handle;
-         iocache.s[2].cmd = 3; // clean+invalidate
--        iocache.s[2].addr = (int)p->arm + base;
-+        iocache.s[2].addr = (int)p.arm + base;
-         iocache.s[2].size  = sz;
- #endif
-         vcsm_clean_invalid( &iocache );
- #else
--        flush_buffer(s->frame->buf[1]);
--        flush_buffer(s->frame->buf[2]);
-+        flush_buffer_u(s->frame);
-+        flush_buffer_v(s->frame);
- #ifdef RPI_LUMA_QPU
--        flush_buffer(s->frame->buf[0]);
-+        flush_buffer_y(s->frame);
- #endif
- 
- #endif
-@@ -992,6 +1000,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
- #endif
- 
- #ifdef RPI_DEBLOCK_VPU
-+#error XXX
- /* rpi_deblock deblocks an entire row of ctbs using the VPU */
- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
- {
-@@ -1000,21 +1009,21 @@ static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
-   // TODO flush buffer of beta/tc setup when it becomes cached
- 
-   // Prepare three commands at once to avoid calling overhead
--  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
-+  s->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
-   s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
-   s->vpu_cmds_arm[0][2] = s->setup_width;
-   s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
-   s->vpu_cmds_arm[0][4] = ctb_size>>4;
-   s->vpu_cmds_arm[0][5] = 2;
- 
--  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
-+  s->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
-   s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
-   s->vpu_cmds_arm[1][2] = s->uv_setup_width;
-   s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-   s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-   s->vpu_cmds_arm[1][5] = 3;
- 
--  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
-+  s->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
-   s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
-   s->vpu_cmds_arm[2][2] = s->uv_setup_width;
-   s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index ffd13ca..b0c9bc5 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -250,7 +250,7 @@ int gpu_get_mailbox(void)
- }
- 
- // Call this to clean and invalidate a region of memory
--void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
- {
- #ifdef RPI_FAST_CACHEFLUSH
-     struct vcsm_user_clean_invalid_s iocache = {};
++
++  return 0;
++}
++
++
++// Wait for completion of the command identified by id; id == 0 currently waits for nothing
++void vpu_wait(int id)
++{
++  if (id == 0) { // with the #if 0 block below disabled, this branch is empty
++#if 0 // disabled: would queue a single EXECUTE_SYNC job and block on a semaphore until its callback fires
++    sem_t sync0;
++    struct gpu_job_s j[1] =
++    {
++      {
++        .command = EXECUTE_SYNC,
++        .u.s.mask = 3,
++        .callback.func = callback,
++        .callback.cookie = (void *)&sync0
++      }
++    };
++
++    sem_init(&sync0, 0, 0);
++
++    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
++
++    sem_wait(&sync0);
++#endif
++  }
++  else {
++    while ((int32_t)(post_done - (uint32_t)id) < 0) { // loop until post_done has reached id; signed difference, presumably to tolerate counter wrap-around
++      usleep(1000); // sleep 1000 microseconds between polls
++    }
++  }
++}
++
++
++unsigned int qpu_get_fn(int num) { // maps a QPU_MC_* index to gpu->vc plus the byte offset of that routine within rpi_shader
++    // Make sure that the gpu is initialized
++    unsigned int *fn;
++    if (gpu==NULL) {
++      printf("Preparing gpu\n");
++      gpu_lock(); // presumably gpu_lock() sets up gpu on first use - confirm against its definition
++      gpu_unlock();
++    }
++    switch(num) {
++    case QPU_MC_SETUP:
++      fn = mc_setup;
++      break;
++    case QPU_MC_FILTER:
++      fn = mc_filter;
++      break;
++    case QPU_MC_EXIT:
++      fn = mc_exit;
++      break;
++    case QPU_MC_INTERRUPT_EXIT12:
++      fn = mc_interrupt_exit12;
++      break;
++    case QPU_MC_FILTER_B:
++      fn = mc_filter_b;
++      break;
++    //case QPU_MC_FILTER_HONLY:
++    //  fn = mc_filter_honly;
++    //  break;
++    case QPU_MC_SETUP_UV:
++      fn = mc_setup_uv;
++      break;
++    case QPU_MC_FILTER_UV:
++      fn = mc_filter_uv;
++      break;
++    case QPU_MC_FILTER_UV_B0:
++      fn = mc_filter_uv_b0;
++      break;
++    case QPU_MC_FILTER_UV_B:
++      fn = mc_filter_uv_b;
++      break;
++    case QPU_MC_INTERRUPT_EXIT8:
++      fn = mc_interrupt_exit8;
++      break;
++    case QPU_MC_END:
++      fn = mc_end;
++      break;
++    default: // any other index, including QPU_MC_FILTER_HONLY whose case is commented out above
++      printf("Unknown function\n");
++      exit(-1); // terminates the whole process
++    }
++    return gpu->vc + 4*(int)(fn-rpi_shader); // fn-rpi_shader is an element offset into rpi_shader; *4 turns it into bytes (assumes 4-byte unsigned int)
++    //return code[num] + gpu->vc;
++}
++
++#if 0
++typedef unsigned int uint32_t;
++
++typedef struct mvs_s { // local type given the name HEVCContext for the test code in this #if 0 section
++    GPU_MEM_PTR_T unif_mvs_ptr;
++    uint32_t *unif_mvs; // Base of memory for motion vector commands
++
++    // _base pointers are to the start of the row
++    uint32_t *mvs_base[8];
++    // these pointers are to the next free space
++    uint32_t *u_mvs[8];
++
++} HEVCContext;
++
++#define RPI_CHROMA_COMMAND_WORDS 12
++
++static void rpi_inter_clear(HEVCContext *s) // rewinds all eight streams and writes one initial command into each
++{
++    int i;
++    for(i=0;i<8;i++) {
++        s->u_mvs[i] = s->mvs_base[i]; // reset the write pointer to the start of stream i
++        *s->u_mvs[i]++ = 0;
++        *s->u_mvs[i]++ = 0;
++        *s->u_mvs[i]++ = 0;
++        *s->u_mvs[i]++ = 0;
++        *s->u_mvs[i]++ = 0;
++        *s->u_mvs[i]++ = 128;  // w
++        *s->u_mvs[i]++ = 128;  // h
++        *s->u_mvs[i]++ = 128;  // stride u
++        *s->u_mvs[i]++ = 128;  // stride v
++        s->u_mvs[i] += 3;  // Padding words: 9 written + 3 skipped = RPI_CHROMA_COMMAND_WORDS (12)
++    }
++}
++
++static void rpi_execute_inter_qpu(HEVCContext *s) // patches the words 12 back from each write pointer, then calls qpu_run_shader8
++{
++    int k;
++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc; // the allocation's .vc value, used as the base for the rebased addresses below
++
++    for(k=0;k<8;k++) {
++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
++    }
++
++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
++
++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)), // word offset of the stream within the ARM mapping, added to unif_vc
++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
++      );
++}
++
++void rpi_test_qpu(void) // allocates eight uniform streams, runs rpi_execute_inter_qpu four times, then frees the memory
++{
++    HEVCContext mvs;
++    HEVCContext *s = &mvs;
++    int i;
++    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS; // number of 32-bit words reserved per stream
++    uint32_t *p;
++    printf("Allocate memory\n");
++    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr ); // NOTE(review): the return value is not checked here
++    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
++
++    // Set up initial locations for uniform streams
++    p = s->unif_mvs;
++    for(i = 0; i < 8; i++) {
++        s->mvs_base[i] = p;
++        p += uv_commands_per_qpu; // streams are laid out back to back in the one allocation
++    }
++    // Now run a simple program that should just quit immediately after a single texture fetch
++    rpi_inter_clear(s);
++    for(i=0;i<4;i++) {
++      printf("Launch QPUs\n");
++      rpi_execute_inter_qpu(s);
++      printf("Done\n");
++    }
++    printf("Free memory\n");
++    gpu_free(&s->unif_mvs_ptr);
++    return;
++}
++#endif
++
++#if 0
++
++int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4}; // used by filter8() for the per-row sums and packed into the uniforms
++//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
++int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4}; // used by filter8() to combine the rows and packed into the uniforms
++//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24); // packs (c-1)&0xff of four values into one word, c0 in the low byte; NOTE(review): expansion ends in ';' and arguments are not parenthesised
++
++static uint8_t av_clip_uint8(int32_t a) // clamps a to the range 0..255
++{
++    if (a&(~255)) return (-a)>>31; // a is outside 0..255: with an arithmetic shift this is -1 (stored as 255) when a > 255 and 0 when a < 0
++    else          return a;
++}
++
++static int32_t filter8(const uint8_t *data, int pitch) // 8x8 filter over data: hcoeffs-weighted row sums combined with vcoeffs
++{
++   int32_t vsum = 0;
++   int x, y;
++
++   for (y = 0; y < 8; y++) {
++      int32_t hsum = 0;
++
++      for (x = 0; x < 8; x++)
++         hsum += hcoeffs[x]*data[x + y * pitch]; // pitch is the distance in bytes between consecutive rows
++
++      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning; the row sum is rounded, shifted by 7 and clipped before weighting
++   }
++
++   return av_clip_uint8( (vsum + 64) >> 7); // same round, shift and clip applied to the combined sum
++}
++
++// Note regression changes coefficients so is not thread safe
++//#define REGRESSION
++#ifdef REGRESSION
++#define CMAX 100
++#else
++#define CMAX 2
++#endif
++#define YMAX 16
++
++int rpi_test_shader(void) // runs CMAX rounds of rpi_do_block() on random input and compares against filter8(); returns 0, or a negative value if an allocation fails
++{
++   int i, c;
++
++   uint32_t *unifs;
++
++   uint8_t *in_buffer;
++   uint8_t *out_buffer[2];
++
++   GPU_MEM_PTR_T unifs_ptr;
++   GPU_MEM_PTR_T in_buffer_ptr;
++   GPU_MEM_PTR_T out_buffer_ptr[2];
++
++   // Addresses in GPU memory of filter programs
++   uint32_t mc_setup = 0; // assigned below but not otherwise read in this function
++   uint32_t mc_filter = 0;
++   uint32_t mc_exit = 0;
++
++   int pitch = 0x500; // destination pitch in bytes
++
++   if (gpu==NULL) {
++      gpu_lock(); // presumably initialises gpu on first use - confirm against gpu_lock()
++      gpu_unlock();
++   }
++
++   printf("This needs to change to reflect new assembler\n");
++   // Use table to compute locations of program start points
++   mc_setup = code[0] + gpu->vc;
++   mc_filter = code[1] + gpu->vc;
++   mc_exit = code[2] + gpu->vc;
++
++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
++      return -2;
++   }
++   unifs = (uint32_t*)unifs_ptr.arm;
++
++   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
++      return -3; // NOTE(review): unifs_ptr is not freed on this path
++   }
++   in_buffer = (uint8_t*)in_buffer_ptr.arm;
++
++   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
++      return -4; // NOTE(review): earlier allocations are not freed on this path
++   }
++   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
++   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
++
++   for (c = 0; c < CMAX; c++) {
++      int xo[] = {rand()&31, rand()&31}; // random x offset (0..31) for each of the two blocks
++
++#ifdef REGRESSION
++      for (i = 0; i < 8; i++) {
++         hcoeffs[i] = (int8_t)rand();
++         vcoeffs[i] = (int8_t)rand();
++         if (hcoeffs[i]==-128)
++           hcoeffs[i]++;
++         if (vcoeffs[i]==-128)
++           vcoeffs[i]++;
++      }
++#endif
++
++      for (i = 0; i < 64*23; i++) {
++         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
++         in_buffer[i] = rand(); // fill the whole input allocation with random bytes
++      }
++
++      // Clear output array
++      {
++        int b;
++        for(b=0;b<2;b++) {
++          for(i=0;i<16*16;i++) {
++            out_buffer[b][i] = 3; // NOTE(review): only the first 256 bytes are set, while the check below indexes rows with stride pitch
++          }
++        }
++      }
++
++      unifs[0] = mc_filter;
++      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
++      unifs[2] = 64; // src pitch
++      unifs[3] = pitch; // dst pitch
++      unifs[4] = 0; // Padding
++      unifs[5] = 0;
++      unifs[6] = 0;
++      unifs[7 ] = mc_filter;
++      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
++      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
++      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
++      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
++      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
++      unifs[13] = out_buffer_ptr[0].vc;
++      unifs[14] = mc_exit;
++      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
++      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
++      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
++      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
++      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
++      unifs[20] = out_buffer_ptr[1].vc;
++
++      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
++
++      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
++
++      //qpu_run_shader(mc_setup, unifs_ptr.vc);
++      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
++      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]); // the unifs[] filled in above are not passed on; rpi_do_block() allocates its own
++      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
++
++      if (1)
++      {
++         int x, y, b;
++         int bad = 0;
++
++         for (b=0; b<2; ++b)
++            for (y=0; y<YMAX; ++y)
++               for (x=0; x<16; ++x) {
++                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64); // ARM-side reference value for this output pixel
++
++                  if (out_buffer[b][x+y*pitch] != ref) {
++                      bad = 1;
++//                     printf("%d, %d, %d, %d\n", c, b, x, y);
++                  }
++#ifndef REGRESSION
++                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
++#endif
++               }
++          if (bad)
++            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
++          else
++            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
++      }
++      //printf("%d\n", simpenrose_get_qpu_tick_count());
++   }
++
++   gpu_free(&out_buffer_ptr[0]);
++   gpu_free(&out_buffer_ptr[1]);
++   gpu_free(&in_buffer_ptr);
++   gpu_free(&unifs_ptr);
++
++   return 0; // returned whether or not any round printed "Failed"
++}
++
++void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch) // fills a 16x16 block of dst with filter8() results
++{
++  int x,y;
++  for (y=0; y<16; ++y) {
++    for (x=0; x<16; ++x) {
++       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch); // each output pixel is filtered from the source window starting at (x, y)
++    }
++  }
++}
++
++void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst) // builds a uniform list and calls qpu_run_shader(); NOTE(review): dst is only referenced in commented-out code
++{
++   uint32_t *unifs;
++
++   GPU_MEM_PTR_T unifs_ptr;
++   //uint8_t *out_buffer;
++   //GPU_MEM_PTR_T out_buffer_ptr;
++
++   // Addresses in GPU memory of filter programs
++   uint32_t mc_setup = 0;
++   uint32_t mc_filter = 0;
++   uint32_t mc_exit = 0;
++   //int x,y;
++
++   if (gpu==NULL) {
++      gpu_lock(); // presumably initialises gpu on first use - confirm against gpu_lock()
++      gpu_unlock();
++   }
++
++   // Use table to compute locations of program start points
++   mc_setup = code[0] + gpu->vc;
++   mc_filter = code[1] + gpu->vc;
++   mc_exit = code[2] + gpu->vc;
++
++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
++      return; // allocation failed: returns without running the shader and without reporting the error
++   }
++   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
++   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
++
++   /*for (y=0; y<16; ++y) {
++      for (x=0; x<16; ++x) {
++         out_buffer[x+y*dst_pitch] = 7;
++      }
++    }*/
++
++   unifs = (uint32_t*)unifs_ptr.arm;
++
++    unifs[0] = mc_filter;
++    unifs[1] = (int)in_buffer_vc;
++    unifs[2] = src_pitch; // src pitch
++    unifs[3] = dst_pitch; // dst pitch
++    unifs[4] = 0; // Padding
++    unifs[5] = 0;
++    unifs[6] = 0;
++    unifs[7 ] = mc_exit;
++    unifs[8 ] = (int)in_buffer_vc;
++    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
++    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
++    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
++    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
++    unifs[13] = (int)dst_vc;
++    //unifs[13] = (int)out_buffer_ptr.vc;
++
++    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
++
++    qpu_run_shader(mc_setup, unifs_ptr.vc); // starts at mc_setup and passes the .vc value of the uniform list
++
++    /*for (y=0; y<16; ++y) {
++      for (x=0; x<16; ++x) {
++         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
++      }
++    }*/
++
++    gpu_free(&unifs_ptr);
++    //gpu_free(&out_buffer_ptr);
++}
++
++
++
++#endif
++
++#endif // RPI
 diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-index 81c2bb1..b913f79 100644
---- a/libavcodec/rpi_qpu.h
+new file mode 100644
+index 0000000..c6cdb2b
+--- /dev/null
 +++ b/libavcodec/rpi_qpu.h
-@@ -2,8 +2,11 @@
- #define RPI_QPU_H
- 
- // Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+@@ -0,0 +1,176 @@
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
 +// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
- #define RPI_FAST_CACHEFLUSH
- 
++#define RPI_FAST_CACHEFLUSH
++
 +#define RPI_ONE_BUF 1
 +
- typedef struct gpu_mem_ptr_s {
-   unsigned char *arm; // Pointer to memory mapped on ARM side
-   int vc_handle;   // Videocore handle of relocatable memory
-@@ -16,9 +19,113 @@ typedef struct gpu_mem_ptr_s {
- extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
- extern void gpu_free(GPU_MEM_PTR_T *p);
--extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
++typedef struct gpu_mem_ptr_s {
++  unsigned char *arm; // Pointer to memory mapped on ARM side
++  int vc_handle;   // Videocore handle of relocatable memory
++  int vcsm_handle; // Handle for use by VCSM
++  int vc;       // Address for use in GPU code
++  int numbytes; // Size of memory block
++} GPU_MEM_PTR_T;
++
++// General GPU functions
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
++extern void gpu_free(GPU_MEM_PTR_T *p);
 +extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
- extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
- 
++extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++
 +#include "libavutil/frame.h"
 +#if !RPI_ONE_BUF
 +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
@@ -37553,9 +12688,2283 @@ index 81c2bb1..b913f79 100644
 +#endif
 +
 +
- // QPU specific functions
- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
++// QPU specific functions
++extern void rpi_test_qpu(void);
++
++enum { // indices passed to qpu_get_fn() to select a shader routine
++  QPU_MC_SETUP,
++  QPU_MC_FILTER,
++  QPU_MC_EXIT,
++  QPU_MC_INTERRUPT_EXIT12,
++  QPU_MC_FILTER_B,
++  QPU_MC_FILTER_HONLY, // the matching case in qpu_get_fn() is commented out, so this value takes its default branch
++  QPU_MC_SETUP_UV,
++  QPU_MC_FILTER_UV,
++  QPU_MC_FILTER_UV_B0,
++  QPU_MC_FILTER_UV_B,
++  QPU_MC_INTERRUPT_EXIT8,
++  QPU_MC_END
++  };
++extern unsigned int qpu_get_fn(int num);
++
++#define QPU_N_UV   8
++#define QPU_N_Y    12
++#define QPU_N_MAX  16
++
++#define QPU_MAIL_EL_VALS  2
++#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
++#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
++#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
++
++// VPU specific functions
++extern unsigned int vpu_get_fn(void);
++extern unsigned int vpu_get_constants(void);
++//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
++extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
++    int qpu0_n, const uint32_t * qpu0_mail,
++    int qpu1_n, const uint32_t * qpu1_mail);
++
++extern void vpu_wait( int id);
++
++// Simple test of shader code
++extern int rpi_test_shader(void);
++
++extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
++extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
++
++extern int gpu_get_mailbox(void);
++
++#endif
+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+new file mode 100644
+index 0000000..06fb166
+--- /dev/null
++++ b/libavcodec/rpi_shader.c
+@@ -0,0 +1,629 @@
++#include "rpi_shader.h"
++
++#ifdef _MSC_VER
++   #include <stdint.h>
++   /* cast through uintptr_t to avoid warnings */
++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
++#else
++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
++#endif
++
++#ifdef __cplusplus
++extern "C" { /* the types are probably wrong... */
++#endif
++#ifdef __cplusplus
++}
++#endif
++
++#ifdef _MSC_VER
++__declspec(align(8))
++#elif defined(__GNUC__)
++__attribute__((aligned(8)))
++#endif
++unsigned int rpi_shader[] = {
++// ::mc_setup_uv
++/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
++/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
++/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
++/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
++/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
++/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
++/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
++/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
++/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
++/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
++/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
++/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
++/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
++/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
++/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
++/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
++/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
++/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
++/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
++/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
++/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
++/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
++/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
++/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
++/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
++/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
++/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
++/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
++/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
++/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
++/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
++/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
++/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
++/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
++/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
++/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
++/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
++/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
++/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
++/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
++/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
++/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
++/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
++/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
++/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
++/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
++// ::mc_filter_uv
++/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
++/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
++/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
++/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
++/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
++/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
++/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
++/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
++/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
++/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
++/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
++/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
++/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
++/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
++/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
++/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
++// :uvloop
++/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
++/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
++/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
++/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
++/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
++/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
++/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
++/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
++/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
++/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_filter_uv_b0
++/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
++/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
++/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
++/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
++/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
++/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
++/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
++/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
++/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
++/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
++/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
++/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
++/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
++/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
++/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
++/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
++/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
++/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
++// :uvloop_b0
++/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
++/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
++/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
++/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
++/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
++/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
++/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
++/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_filter_uv_b
++/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
++/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
++/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
++/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
++/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
++/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
++/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
++/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
++/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
++/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
++/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
++/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
++/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
++/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
++/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
++/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
++/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
++/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
++/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
++/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++// :uvloop_b
++/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
++/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
++/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
++/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
++/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
++/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
++/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
++/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
++/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
++/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
++/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
++/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
++/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
++/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
++/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
++/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
++/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_exit
++/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
++/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
++/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_interrupt_exit8
++/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_setup
++/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
++/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
++/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
++/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
++/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
++/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
++/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
++/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
++/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
++/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
++/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
++/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
++/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
++/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
++/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
++/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
++/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
++/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
++/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
++/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
++/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
++/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
++/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
++/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
++/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
++/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
++/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
++/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
++/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
++/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
++/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
++/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
++/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
++/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
++/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
++/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
++/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
++/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
++/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
++/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
++/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
++/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
++/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
++/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
++/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
++/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
++/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
++/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
++/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
++/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
++/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
++/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
++/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
++// :per_block_setup
++/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
++/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
++/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
++/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
++/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
++/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
++/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
++/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
++/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
++/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
++/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
++/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
++/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
++/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
++/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
++/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
++/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
++/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
++/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
++/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
++/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
++/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
++/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
++/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
++/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
++/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
++/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
++/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
++/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
++/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
++/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
++/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
++// ::mc_filter
++/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
++// :yloop
++/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
++/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
++/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
++/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
++/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
++/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
++/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
++/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
++/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
++/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
++/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
++/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
++/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
++/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
++/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
++/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_filter_b
++/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
++// :yloopb
++/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
++/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
++/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
++/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
++/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
++/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
++/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
++/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
++/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
++/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
++/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
++/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
++/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
++/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
++/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
++/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
++/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
++/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
++/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
++/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
++/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
++/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
++/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
++/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
++/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
++/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
++/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
++/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
++/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
++/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
++/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_interrupt_exit12
++/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_exit1
++/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
++/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
++// ::mc_end
++};
++#ifdef __HIGHC__
++#pragma Align_to(8, rpi_shader)
++#endif
+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+new file mode 100644
+index 0000000..9772796
+--- /dev/null
++++ b/libavcodec/rpi_shader.h
+@@ -0,0 +1,19 @@
++#ifndef rpi_shader_H
++#define rpi_shader_H
++/* Entry points into the QPU code array; every offset below counts unsigned int words. */
++extern unsigned int rpi_shader[];
++/* Each instruction is two words, so a label's byte address is offset * 4 (mc_filter_b: 992 * 4 = 0xf80). */
++#define mc_setup_uv (rpi_shader + 0)
++#define mc_filter_uv (rpi_shader + 132)
++#define mc_filter_uv_b0 (rpi_shader + 274)
++#define mc_filter_uv_b (rpi_shader + 392)
++#define mc_exit (rpi_shader + 540)
++#define mc_interrupt_exit8 (rpi_shader + 558)
++#define mc_setup (rpi_shader + 588)
++#define mc_filter (rpi_shader + 872)
++#define mc_filter_b (rpi_shader + 992)
++#define mc_interrupt_exit12 (rpi_shader + 1114)
++#define mc_exit1 (rpi_shader + 1152)
++#define mc_end (rpi_shader + 1168)
++
++#endif
+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+new file mode 100644
+index 0000000..aa9e1e7
+--- /dev/null
++++ b/libavcodec/rpi_shader.qasm
+@@ -0,0 +1,1098 @@
++# register allocation
++#
++# ra0...ra7                                     eight horizontal filter coefficients
++#
++# rb0 rx_xshift2
++# rb1 rx_xshift2_next
++#
++# rb4...rb7
++#
++# rb8..rb11, ra8...ra11                         Y: eight filtered rows of context (ra11 == most recent)
++#
++#                                               (ra15 isn't clamped to zero - this happens during the
++#                                                copy to ra14, and during its use in the vertical filter)
++#
++# rb8...rb11                                    eight vertical filter coefficients
++
++# ra4                                           y: Filter, UV: 0x10000
++
++# rb12                                          offset to add before shift (round + weighting offsets)
++# rb13                                          shift: denom + 6 + 9
++# rb14                                          L0 weight (U on left, V on right)
++# rb15                                          -- free --
++#
++# ra16                                          clipped(row start address+elem_num)&~3
++# ra17                                          per-channel shifts
++# ra18                                          L1 weight (Y)
++# ra19                                          next ra17
++#
++# rb16                                          pitch
++# rb17                                          height + 1
++# rb18                                          height + 3
++# rb19                                          next ra16
++#
++# ra20                                          1
++# ra21                                          ra_21
++# ra22 ra_k256                                  256
++# ra23 ra_y2_next                               ra_y2_next
++#
++# rb20                                          0xffffff00
++# rb21                                          vpm_setup for reading/writing 16bit results into VPM
++# rb22 rb_k255                                  255
++# rb23                                          24
++#
++# rb24                                          vdw_setup_1(dst_pitch)
++# rb25                                          frame width-1
++# rb26                                          height<<23 + width<<16 + vdw_setup_0
++# rb27                                          vdw_setup_0 (depends on QPU number)
++# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
++# rb29                                          vdw_setup_1(dst_pitch-width)
++# rb30                                          frame height-1
++# rb31                                          used as temp to count loop iterations
++#
++# ra24                                          clipped(row start address+8+elem_num)&~3
++# ra25                                          per-channel shifts 2
++# ra26                                          next ra24
++# ra27                                          next ra25
++# ra28                                          next y
++# ra29                                          y for next texture access
++# ra30                                          64
++#
++# ra31                                          next kernel address
++
++.set rb_frame_width_minus_1,       rb25
++.set rb_frame_height_minus_1,      rb30
++.set rb_pitch,                     rb16
++.set ra_x,                         ra16
++.set ra_y2,                        ra21.16a
++.set ra_y2_next,                   ra21.16b
++# rb19 carries two names: rb_x_next (chroma kernels) and rx_frame_base2_next (luma kernels)
++.set rb_x_next,                    rb19
++.set rx_frame_base2_next,          rb19
++
++.set ra_frame_base,                ra24
++.set ra_frame_base_next,           ra26
++.set ra_xshift,                    ra17
++# ra25 carries two names: u->v reference offset (mc_setup_uv) and second reference base (mc_setup)
++.set ra_u2v_ref_offset,            ra25
++.set ra_frame_base2,               ra25
++
++.set ra_xshift_next,               ra19
++.set rx_xshift2,                   rb0
++.set rx_xshift2_next,              rb1
++
++.set ra_u2v_dst_offset,            ra27
++
++.set ra_y_next,                    ra28
++.set ra_y,                         ra29
++# Constants loaded once by mc_setup_uv / mc_setup
++.set ra_k1,                        ra20
++.set rb_k255,                      rb22
++.set ra_k256,                      ra22
++
++# Shift counts use only the bottom 5 bits, so -16 encodes a shift by 16 and -11 a shift by 21.
++.set i_shift16,                    -16
++.set i_shift21,                    -11
++
++################################################################################
++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++::mc_setup_uv
++# One-time chroma setup: consumes twelve uniforms, loads constants and queues two rows of fetches on each TMU.
++# Read starting kernel
++mov ra31, unif
++
++# Load first request location
++add ra_x, unif, elem_num # Store x (per-element: x + element index)
++mov ra_y, unif # Store y
++mov ra_frame_base, unif # Store frame u base
++nop                     # NOTE(review): presumably pads the write->read of ra_frame_base - confirm
++sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
++
++# Read image dimensions (stored as width-1 / height-1 for clamping)
++sub rb25,unif,1
++sub rb30,unif,1
++
++# get source pitch
++mov rb16, unif
++
++# get destination pitch (rb24 = vdw_setup_1(dst_pitch))
++mov r0, unif
++mov r1, vdw_setup_1(0)
++add rb24, r1, r0
++
++# load constants
++# ra4 = 0x10000 is the multiplier mc_filter_uv_b uses before sign-extending 16-bit VPM reads
++mov ra4, 0x10000
++mov ra_k1, 1
++mov ra_k256, 256
++mov ra30, 64
++
++mov rb20, 0xffffff00
++mov rb_k255, 255
++mov rb23, 24
++
++# touch vertical context to keep simulator happy
++
++mov ra8, 0
++mov ra9, 0
++mov ra10, 0
++mov ra11, 0
++mov ra12, 0
++mov ra13, 0
++mov ra14, 0
++mov ra15, 0
++
++# Compute base address for first and second access
++mov r0, ra_x           # Load x
++max r0, r0, 0; mov r1, ra_y # Load y
++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset  # shift = 8 * clamped x
++add ra_y, r1, 1
++add r0, r0, r3
++and r0, r0, ~3         # word-align the U row address
++max r1, r1, 0 ; mov ra_x, r0 # y
++min r1, r1, rb_frame_height_minus_1
++# submit texture requests for first line (TMU0 = U row, TMU1 = V row)
++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
++add t0s, r0, r1 ; mov ra_frame_base, r2
++add t1s, r2, r1
++
++mov r2, 9
++add rb13, r2, unif  # denominator
++mov -, unif         # Unused
++# NOTE(review): the header lists (offset, denom) but denom is read first and the next uniform is discarded - confirm order
++# Compute part of VPM to use for DMA output
++mov r2, unif
++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
++and r2, r2, 15
++mov r1, r2
++asr r1, r1, 2
++shl r1, r1, 6
++mov r0, r2
++and r0, r0, 3
++add r0, r0, r1
++
++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++add rb28, r0, r1  # VPM 8bit storage
++asr r2, r0, 1     # r0 = bc0000d
++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
++add rb21, r2, r1  # VPM for 16bit intermediates
++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++shl r0, r0, 5
++add rb27, r0, r1  # DMA out
++
++# submit texture requests for second line; the three instructions after bra are its delay slots
++max r1, ra_y, 0
++min r1, r1, rb_frame_height_minus_1
++add ra_y, ra_y, 1
++bra -, ra31
++nop ; mul24 r1, r1, rb_pitch
++add t0s, r1, ra_x
++add t1s, r1, ra_frame_base
++
++
++
++################################################################################
++
++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++# width_height is used as height in .16a and width in .16b (see rb17/rb18 and rb29 below)
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x and ra_frame_base hold the U and V row base addresses used for this block's fetches
++::mc_filter_uv
++mov ra31, unif
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++add r0, unif, elem_num    # x
++max r0, r0, 0         ; mov r1, unif # y
++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
++# compute offset from frame base u to frame base v
++sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
++shl ra_xshift_next, r0, 3
++add r0, r0, r3        ; mov ra1, unif  # ; width_height
++and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
++mov ra_y_next, r1     ; mov vw_setup, rb28
++add ra_frame_base_next, rb_x_next, r2
++
++# set up VPM write
++# get width,height of block
++
++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
++add rb17, ra1.16a, 1
++add rb18, ra1.16a, 3
++shl r0,   ra1.16a, 7
++add r0,   r0, ra1.16b    # Combine width and height of destination area
++shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
++add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++# unpack filter coefficients; .ifnz writes only elements 8..15, so those take the V offset/weight
++
++mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
++mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
++nop                   ; mov rb10, ra3.8c
++mov r3, 0             ; mov rb11, ra3.8d   # Loop count
++
++shl r1, ra1.16b, rb13
++asr rb12, r1, 1
++shl rb14, ra1.16a, 1  # b14 = weight*2
++
++# rb14 - weight L0 * 2
++# rb13 = weight denom + 6 + 9
++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++
++# The loop runs height + 3 times: three passes only prime the 4-row vertical context,
++# the remaining height passes each write one output row to the VPM.
++# On the pass where r3 == rb17 (height + 1) the fetch state switches to the next block.
++
++# r3 = 0
++:uvloop
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++
++max r2, ra_y, 0  # y
++min r2, r2, rb_frame_height_minus_1
++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
++add t1s, ra_frame_base, r2
++
++# 4-tap horizontal filter: tap n uses the U samples shifted by n and, on the .ifnz
++# elements, the V samples shifted by n + 8
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++# apply horizontal filter
++nop                  ; mul24      r3, ra0.8a,       r0
++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++sub r0, r2, r3       ; mov r3, rb31
++sub.setf -, r3, 4    ; mov ra12, ra13
++brr.anyn -, r:uvloop
++mov ra13, ra14          ; mul24 r1, ra14, rb9
++mov ra14, ra15
++mov ra15, r0            ; mul24 r0, ra12, rb8
++# >>> .anyn uvloop
++
++# apply vertical filter and write to VPM
++
++sub r1, r1, r0          ; mul24 r0, ra14, rb10
++add r1, r1, r0          ; mul24 r0, ra15, rb11
++sub r1, r1, r0          ; mov -, vw_wait
++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++asr r1, r1, 14
++nop                     ; mul24 r1, r1, rb14
++shl r1, r1, 8
++
++add r1, r1, rb12
++brr.anyn -, r:uvloop
++asr r1, r1, rb13
++min r1, r1, rb_k255       # Delay 2
++max vpm, r1, 0         # Delay 3
++
++# DMA out for U
++
++mov vw_setup, rb26 # VDW setup 0
++mov vw_setup, rb29 # Stride
++mov vw_addr, unif # start the VDW
++
++# DMA out for V
++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
++# Could potentially push this write into the start of the next pipeline stage.
++mov r0, 16
++mov -, vw_wait
++
++bra -, ra31
++add vw_setup, rb26, r0 # VDW setup 0
++mov vw_setup, rb29 # Stride
++mov vw_addr, unif # start the VDW
++
++
++################################################################################
++
++# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, weight_u, weight_v, this_u_dst, this_v_dst)
++# The two dst uniforms are read and discarded; this pass leaves 16-bit intermediates in the VPM.
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x and ra_frame_base hold the U and V row base addresses used for this block's fetches
++::mc_filter_uv_b0
++mov ra31, unif
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++add r0, unif, elem_num       # x
++max r0, r0, 0                ; mov r1, unif # y
++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
++sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
++shl ra_xshift_next, r0, 3
++add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
++and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
++mov ra_y_next, r1            ; mov vw_setup, rb21
++
++add ra_frame_base_next, rb_x_next, r2
++
++# Need to have unsigned coeffs so we can just unpack in the filter
++# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
++# filter code. Unpack into b regs for V
++
++# set up VPM write, we need to save 16bit precision
++
++sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
++add rb17, ra1.16a, 1
++add rb18, ra1.16a, 3
++shl r0,   ra1.16a, 7
++add r0,   r0, ra1.16b           # Combine width and height of destination area
++shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
++add rb26, r0, rb27
++
++mov rb8, ra3.8a
++mov rb9, ra3.8b
++mov rb10, ra3.8c
++mov rb11, ra3.8d
++
++# The loop runs height + 3 times; the first three passes only prime the vertical context
++# r3 is loop counter
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++mov      rb14, unif                 # U weight L0
++mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
++# rb14 unused in b0 but will hang around till the second pass
++
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# r3 = 0
++:uvloop_b0
++# On the pass where r3 == rb17 (height + 1) the fetch state switches to the next block,
++# so the last two requests issued here already belong to the next kernel.
++
++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++
++max r2, ra_y, 0  # y
++min r2, r2, rb_frame_height_minus_1
++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
++add t1s, ra_frame_base, r2
++
++# 4-tap horizontal filter: tap n uses the U samples shifted by n and, on the .ifnz
++# elements, the V samples shifted by n + 8
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++nop                  ; mul24      r3, ra0.8a,       r0
++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++sub r0, r2, r3       ; mov r3, rb31
++sub.setf -, r3, 4    ; mov ra12, ra13
++brr.anyn -, r:uvloop_b0
++mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
++mov ra14, ra15
++mov ra15, r0            ; mul24 r0, ra12, rb8
++# >>> .anyn uvloop_b0
++
++# apply vertical filter and write to VPM
++
++sub r1, r1, r0          ; mul24 r0, ra14, rb10
++sub.setf -, r3, rb18
++brr.anyn -, r:uvloop_b0
++add r1, r1, r0          ; mul24 r0, ra15, rb11
++sub r1, r1, r0          ; mov -, vw_wait
++asr vpm, r1, 6
++# >>> .anyn uvloop_b0
++
++# in pass0 we don't really need to save any results, but need to discard the uniforms
++# (no DMA is started here; the two remaining uniforms are consumed in the bra delay slots)
++
++bra -, ra31
++mov -, unif           # Delay 1
++mov -, unif           # Delay 2
++nop                   # Delay 3
++
++
++################################################################################
++
++::mc_filter_uv_b
++mov ra31, unif
++# Uniforms read, in order: next_kernel, x, y, two frame bases, width_height, hcoeffs, vcoeffs, U and V offset/weight, U dst, V dst
++# per-channel shifts were calculated on the *previous* invocation
++
++# set up VPM write
++mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
++
++# get base addresses and per-channel shifts for *next* invocation
++add r0, unif, elem_num    # x
++max r0, r0, 0                      ; mov ra_y_next, unif # y
++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
++# compute offset from frame base u to frame base v
++sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
++add r0, r0, r3                     ; mov ra1, unif       # width_height
++and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
++# NOTE(review): the bases are labelled V then U here but frame_base then u->v offset in mc_filter_uv - confirm with caller
++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
++add rb17, ra1.16a, 1
++add rb18, ra1.16a, 3
++shl r0,   ra1.16a, 7
++
++add ra_frame_base_next, rb_x_next, r2
++
++# r0 is currently height<<7
++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
++shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
++shr r3, r3, 8
++add vr_setup, r3, rb21
++
++add r0, r0, ra1.16b    # Combine width and height of destination area
++shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
++add rb26, r0, rb27
++
++# get filter coefficients
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++# Get offset & weight stuff
++
++# The unif read occurs unconditionally, only the write is conditional
++mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
++mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
++nop                 ; mov rb10, ra3.8c
++mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
++
++shl r1, ra1.16b, rb13
++asr rb12, r1, 1
++
++# ra1.16a used directly in the loop
++
++# The loop runs height + 3 times: three passes only prime the 4-row vertical context,
++# the remaining height passes each combine one row with the value read back from the VPM.
++
++# r3 = 0
++:uvloop_b
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++
++max r2, ra_y, 0  # y
++min r2, r2, rb_frame_height_minus_1
++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
++add t0s, ra_x, r2         ; v8subs r1, r1, rb20
++add t1s, ra_frame_base, r2
++
++# 4-tap horizontal filter: tap n uses the U samples shifted by n and, on the .ifnz
++# elements, the V samples shifted by n + 8
++
++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++nop                  ; mul24      r3, ra0.8a,       r0
++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++sub r0, r2, r3       ; mov r3, rb31
++sub.setf -, r3, 4    ; mov ra12, ra13
++brr.anyn -, r:uvloop_b
++mov ra13, ra14          ; mul24 r1, ra14, rb9
++mov ra14, ra15
++mov ra15, r0            ; mul24 r0, ra12, rb8
++# >>> .anyn uvloop_b
++
++# apply vertical filter and write to VPM
++
++sub r1, r1, r0          ; mul24 r0, ra14, rb10
++add r1, r1, r0          ; mul24 r0, ra15, rb11
++# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
++sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++asr r1, r1, 14          # shift2=6
++
++asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
++nop                     ; mul24 r0, r0, rb14
++
++add r1, r1, r0          ; mov -, vw_wait
++shl r1, r1, 8           # Lose bad top 8 bits & sign extend
++
++add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
++
++brr.anyn -, r:uvloop_b
++asr r1, r1, rb13         # Delay 1
++min r1, r1, rb_k255       # Delay 2
++max vpm, r1, 0         # Delay 3
++
++
++# DMA out for U
++
++mov vw_setup, rb26 # VDW setup 0
++mov vw_setup, rb29 # Stride
++mov vw_addr, unif # start the VDW
++
++# DMA out for V
++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
++# Could potentially push this write into the start of the next pipeline stage.
++mov r0, 16
++mov -, vw_wait
++
++bra -, ra31
++add vw_setup, rb26, r0 # VDW setup 0
++mov vw_setup, rb29 # Stride
++mov vw_addr, unif # start the VDW
++
++################################################################################
++
++# mc_exit()
++# Exit path without an interrupt: wait for DMA, signal semaphore 0, drain the TMUs, end the thread.
++::mc_exit
++mov  -, vw_wait # wait on the VDW
++# NOTE(review): presumably pairs with the sacq(0) calls in mc_interrupt_exit8 - confirm
++mov -,srel(0)
++# discard the two texture results still outstanding on each TMU
++ldtmu0
++ldtmu1
++ldtmu0
++ldtmu1
++
++nop        ; nop ; thrend
++nop        ; nop # delay slot 1
++nop        ; nop # delay slot 2
++
++# mc_interrupt_exit8()
++::mc_interrupt_exit8
++mov  -, vw_wait # wait on the VDW
++# discard the two texture results still outstanding on each TMU
++ldtmu0
++ldtmu1
++ldtmu0
++ldtmu1
++# NOTE(review): seven acquires of semaphore 0, presumably one per other QPU that ran mc_exit - confirm
++mov -,sacq(0) # 1
++mov -,sacq(0) # 2
++mov -,sacq(0) # 3
++mov -,sacq(0) # 4
++mov -,sacq(0) # 5
++mov -,sacq(0) # 6
++mov -,sacq(0) # 7
++# end the thread; the interrupt write sits in the first thrend delay slot
++nop        ; nop ; thrend
++mov interrupt, 1; nop # delay slot 1
++nop        ; nop # delay slot 2
++
++
++
++
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
++
++################################################################################
++# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
++::mc_setup
++  mov r3, 16
++  # r3 = 16 is the shift used below to split packed uniforms into their high and low 16-bit halves
++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
++  mov ra8, unif  # y_x
++  mov ra9, unif  # ref_y_base
++  mov ra10, unif # y2_x2
++  mov ra11, unif # ref_y2_base
++
++# Read image dimensions (width in the high half, height in the low half)
++  mov r1, unif # width_height
++  shl r0,r1,r3
++  asr r1,r1,r3 # width
++  asr r0,r0,r3 # height
++  sub rb_frame_width_minus_1,r1,1
++  sub rb_frame_height_minus_1,r0,1
++
++# get source pitch
++  mov rb_pitch, unif # src_pitch
++
++# get destination pitch
++  mov r0, unif       # dst_pitch
++  mov r1, vdw_setup_1(0)
++  add rb24, r1, r0
++
++# Compute base address for first and second access
++  mov r1, ra8 # y_x
++  shl r0,r1,r3 # r0 is x<<16
++  asr r1,r1,r3 # r1 is y
++  asr r0,r0,r3 # r0 is x
++  add r0, r0, elem_num # Load x
++  max r0, r0, 0
++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
++  shl ra_xshift_next, r0, 3 # Compute shifts
++  add ra_y, r1, 1
++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
++  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
++  max r1, r1, 0
++  min r1, r1, rb_frame_height_minus_1
++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
++  add t0s, r2, r1 ; mov ra_frame_base, r2
++
++  mov r1, ra10 # y2_x2
++  shl r0,r1,r3 # r0 is x<<16
++  asr r1,r1,r3 # r1 is y
++  asr r0,r0,r3 # r0 is x
++  add r0, r0, elem_num # Load x
++  max r0, r0, 0
++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
++  shl rx_xshift2_next, r0, 3 # Compute shifts
++  add ra_y2, r1, 1
++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
++  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
++  max r1, r1, 0
++  min r1, r1, rb_frame_height_minus_1
++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame1
++  add t1s, r2, r1 ; mov ra_frame_base2, r2
++
++
++# load constants
++
++  mov ra_k1, 1
++  mov ra_k256, 256
++  mov ra30, 64
++
++  mov rb20, 0xffffff00
++  mov rb_k255, 255
++  mov rb23, 24
++
++# touch vertical context to keep simulator happy
++
++  mov ra8, 0
++  mov ra9, 0
++  mov ra10, 0
++  mov ra11, 0
++  mov ra12, 0
++  mov ra13, 0
++  mov ra14, 0
++  mov ra15, 0
++
++# Compute part of VPM to use (derived from qpu_num here, not from a uniform as in mc_setup_uv)
++  mov r2, qpu_num
++  mov r1, r2
++  asr r1, r1, 2
++  shl r1, r1, 6
++  mov r0, r2
++  and r0, r0, 3
++  add r0, r0, r1
++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++  add rb28, r0, r1  # VPM for saving data
++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++  shl r0, r0, 5
++  add rb27, r0, r1  # Command for dma output
++
++# Weighted prediction denom
++  add rb13, unif, 9  # unif = weight denom + 6
++
++  mov -, unif # Unused
++
++# submit texture requests for second line (one per reference, so two rows are in flight on each TMU)
++  max r1, ra_y, 0
++  min r1, r1, rb_frame_height_minus_1
++  add ra_y, ra_y, 1
++  nop ; mul24 r1, r1, rb_pitch
++  add t0s, r1, ra_frame_base
++
++  max r1, ra_y2, 0
++  min r1, r1, rb_frame_height_minus_1
++  add ra_y2, ra_y2, 1
++  nop ; mul24 r1, r1, rb_pitch
++  add t1s, r1, ra_frame_base2
++
++# FALL THROUGH TO PER-BLOCK SETUP
++
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++:per_block_setup
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++  mov ra31, unif
++
++  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
++
++# per-channel shifts were calculated on the *previous* invocation
++  mov ra_xshift, ra_xshift_next
++  mov rx_xshift2, rx_xshift2_next
++
++# get base addresses and per-channel shifts for *next* invocation
++
++  add r0, ra1.16a, r1 # Load x
++  max r0, r0, 0
++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
++  shl ra_xshift_next, r0, 3 # Compute shifts
++  mov r3, 8                          ; mov ra_y_next, ra1.16b
++  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
++  add ra_frame_base_next, r2, r0
++
++  add r0, ra1.16a, r1 # Load x
++  max r0, r0, 0
++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
++  shl rx_xshift2_next, r0, 3         # Compute shifts
++  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
++  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
++  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
++
++# set up VPM write
++  mov vw_setup, rb28
++
++# get width,height of block (unif load above)
++  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
++  add rb17, ra1.16a, 5
++  add rb18, ra1.16a, 7
++  shl r0,   ra1.16a, 7
++  add r0,   r0, ra1.16b # Combine width and height of destination area
++  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++  shl.ifz r0, r0, i_shift16      # Pick half to use
++  shl ra8, r0, 3
++
++# Pack the 1st 4 filter coefs for H & V tightly
++
++  mov r1,0x00010100  # -ve
++  ror ra2.8a, r1, ra8.8d
++  ror ra0.8a, r1, ra8.8c
++
++  mov r1,0x01040400
++  ror ra2.8b, r1, ra8.8d
++  ror ra0.8b, r1, ra8.8c
++
++  mov r1,0x050b0a00  # -ve
++  ror ra2.8c, r1, ra8.8d
++  ror ra0.8c, r1, ra8.8c
++
++  mov r1,0x11283a40
++  ror ra2.8d, r1, ra8.8d
++  ror ra0.8d, r1, ra8.8c
++
++# In the 2nd vertical half we use b registers due to
++# using a-side fifo regs. The easiest way to achieve this to pack it
++# and then unpack!
++
++  mov r1,0x3a281100
++  ror ra3.8a, r1, ra8.8d
++  ror ra1.8a, r1, ra8.8c
++
++  mov r1,0x0a0b0500  # -ve
++  ror ra3.8b, r1, ra8.8d
++  ror ra1.8b, r1, ra8.8c
++
++  mov r1,0x04040100
++  ror ra3.8c, r1, ra8.8d
++  ror ra1.8c, r1, ra8.8c
++
++# Extract weighted prediction information in parallel
++
++  mov r1,0x01010000  # -ve
++  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; r0 = weight L1 (hi16) / weight L0 (lo16)
++  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
++
++# r3 = 16 from (long way) above
++  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
++  asr ra18, r0, r3          ; mov rb5, ra3.8b   # ra18 = weight L1 (arithmetic shift right by r3 = 16) ;
++  bra -, ra31                                   # branch to the address loaded from unif into ra31 at the start of per_block_setup; the next three instructions still execute
++  shl r0, r0, r3            ; mov rb6, ra3.8c   # r0 = weight L0 << 16 ;
++  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
++  asr rb12, r1, 9                               # rb12 = combined offset << (rb13 - 1)
++
++# >>> branch ra31
++#
++# r3 = 0
++# ra18 = weight L1
++# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
++# rb13 = weight denom + 6 + 9
++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++
++
++################################################################################
++# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, y2_x2 should be y_x+8
++# At this point we have already issued two pairs of texture requests for the current block
++
++::mc_filter
++# r0 = weight << 16; We want weight * 2 in rb14
++  asr rb14, r0, 15
++
++# r3 = 0
++
++:yloop
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# If we knew there was no clipping then this code would get simpler.
++# Perhaps we could add on the pitch and clip using larger values?
++
++# N.B. Whilst y == y2 as far as this loop is concerned we will start
++# the grab for the next block before we finish with this block and that
++# might be B where y != y2 so we must do full processing on both y and y2
++
++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++
++  max r2, ra_y, 0  # y
++  min r2, r2, rb_frame_height_minus_1
++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++
++  max r2, ra_y2, 0  # y
++  min r2, r2, rb_frame_height_minus_1
++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++
++# generate seven shifted versions
++# interleave with scroll of vertical context
++
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++# apply horizontal filter
++  nop                  ; mul24      r3, ra0.8a,      r0
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  sub r0, r2, r3       ; mov r3, rb31
++
++  sub.setf -, r3, 8       ; mov r1,   ra8
++  mov ra8,  ra9           ; mov rb8,  rb9
++  brr.anyn -, r:yloop
++  mov ra9,  ra10          ; mov rb9,  rb10
++  mov ra10, ra11          ; mov rb10, rb11
++  mov ra11, r0            ; mov rb11, r1
++  # >>> .anyn yloop
++
++  # apply vertical filter and write to VPM
++
++  nop                     ; mul24 r0, rb8,  ra2.8a
++  nop                     ; mul24 r1, rb9,  ra2.8b
++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
++  add r1, r1, r0          ; mul24 r0, ra11, rb7
++  sub r1, r1, r0          ; mov -, vw_wait
++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
++#  +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
++# The top 8 bits have rubbish in them as mul24 is unsigned
++# The low 6 bits need discard before weighting
++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256  # x256 - sign extend & discard rubbish
++  asr r1, r1, 14
++  nop                     ; mul24 r1, r1, rb14
++  add r1, r1, rb12
++
++  shl r1, r1, 8
++  brr.anyn -, r:yloop
++  asr r1, r1, rb13
++# We have a saturating pack unit - I can't help feeling it should be useful here
++  min r1, r1, rb_k255       # Delay 2  rb_k255 = 255
++  max vpm, r1, 0         # Delay 3
++# >>> branch.anyn yloop
++
++# DMA out
++
++  brr -, r:per_block_setup
++  mov vw_setup, rb26 # VDW setup 0    Delay 1
++  mov vw_setup, rb29 # Stride         Delay 2
++  mov vw_addr, unif # start the VDW   Delay 3
++
++
++
++################################################################################
++
++# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, only the first half of coefficients contain used information.
++# At this point we have already issued two pairs of texture requests for the current block
++# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
++# Can fill in the coefficients so only
++# Can also assume default weighted prediction for B frames.
++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
++# Or possibly by taking advantage of symmetry?
++# From 19->7 32bits per command.
++
++::mc_filter_b
++  # r0 = weightL0 << 16, we want it in rb14
++  asr rb14, r0, i_shift16
++
++:yloopb
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# If we knew there was no clipping then this code would get simpler.
++# Perhaps we could add on the pitch and clip using larger values?
++
++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
++
++  max r2, ra_y, 0  # y
++  min r2, r2, rb_frame_height_minus_1
++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++
++  max r2, ra_y2, 0  # y
++  min r2, r2, rb_frame_height_minus_1
++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
++
++# generate seven shifted versions
++# interleave with scroll of vertical context
++
++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++
++# apply horizontal filter
++  nop                  ; mul24      r3, ra0.8a,      r0
++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++  sub r0, r2, r3       ; mov r3, rb31
++
++  sub.setf -, r3, 8       ; mov r1,   ra8
++  mov ra8,  ra9           ; mov rb8,  rb9
++  brr.anyn -, r:yloopb
++  mov ra9,  ra10          ; mov rb9,  rb10
++  mov ra10, ra11          ; mov rb10, rb11
++  mov ra11, r0            ; mov rb11, r1
++  # >>> .anyn yloopb
++
++  # apply vertical filter and write to VPM
++
++  nop                     ; mul24 r0, rb8,  ra2.8a
++  nop                     ; mul24 r1, rb9,  ra2.8b
++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
++  add r1, r1, r0          ; mul24 r0, ra11, rb7
++  sub r1, r1, r0          ; mov r2, rb12
++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
++# Top 8 bits are bad - low 6 bits should be discarded
++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
++
++  asr r1, r1, 14
++  nop                     ; mul24 r0, r1, rb14
++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
++
++  add r1, r1, r0          ; mov -, vw_wait
++  shl r1, r1, 8
++
++  brr.anyn -, r:yloopb
++  asr r1, r1, rb13         # Delay 1
++  min r1, r1, rb_k255       # Delay 2
++  max vpm, r1, 0         # Delay 3
++
++# DMA out
++  brr -, r:per_block_setup
++  mov vw_setup, rb26 # VDW setup 0    Delay 1
++  mov vw_setup, rb29 # Stride         Delay 2
++  mov vw_addr, unif # start the VDW   Delay 3
++
++################################################################################
++
++# mc_interrupt_exit12(): wait for the VDW, read back four TMU results, acquire semaphore 0 eleven times, then end the thread and write 1 to the interrupt register
++::mc_interrupt_exit12
++  mov  -, vw_wait # wait on the VDW
++
++  # Dummy wait to test instructions (disabled - the whole loop below is commented out)
++#  mov r3,1000000
++#:dummy_loop
++#  sub.setf r3, r3, 1
++#  nop
++#  nop
++#  brr.anynn -, r:dummy_loop
++#  nop
++#  nop
++#  nop
++
++  ldtmu0  # results are not used; presumably drains the texture requests already issued ahead by the filter loops
++  ldtmu0
++  ldtmu1
++  ldtmu1
++
++  mov -,sacq(0) # 1 - NOTE(review): eleven acquires of semaphore 0, presumably one per other QPU of the 12 in the label name - confirm
++  mov -,sacq(0) # 2
++  mov -,sacq(0) # 3
++  mov -,sacq(0) # 4
++  mov -,sacq(0) # 5
++  mov -,sacq(0) # 6
++  mov -,sacq(0) # 7
++  mov -,sacq(0) # 8
++  mov -,sacq(0) # 9
++  mov -,sacq(0) # 10
++  mov -,sacq(0) # 11
++
++  nop        ; nop ; thrend  # thread end signal; the two delay-slot instructions below still execute
++  mov interrupt, 1; nop # delay slot 1
++  nop        ; nop # delay slot 2
++
++
++::mc_exit1
++  mov  -, vw_wait # wait on the VDW
++
++  ldtmu0  # results are not used; presumably drains the texture requests already issued ahead by the filter loops
++  ldtmu1
++  ldtmu0
++  ldtmu1
++  nop        ; nop ; thrend  # thread end signal; the two delay-slot instructions below still execute
++  mov interrupt, 1; nop # delay slot 1
++  nop        ; nop # delay slot 2
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+new file mode 100644
+index 0000000..db41a4d
+--- /dev/null
++++ b/libavcodec/rpi_user_vcsm.h
+@@ -0,0 +1,459 @@
++/*****************************************************************************
++* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
++*
++* This program is the proprietary software of Broadcom Corporation and/or
++* its licensors, and may only be used, duplicated, modified or distributed
++* pursuant to the terms and conditions of a separate, written license
++* agreement executed between you and Broadcom (an "Authorized License").
++* Except as set forth in an Authorized License, Broadcom grants no license
++* (express or implied), right to use, or waiver of any kind with respect to
++* the Software, and Broadcom expressly reserves all rights in and to the
++* Software and all intellectual property rights therein.  IF YOU HAVE NO
++* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
++* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
++* THE SOFTWARE.
++*
++* Except as expressly set forth in the Authorized License,
++* 1. This program, including its structure, sequence and organization,
++*    constitutes the valuable trade secrets of Broadcom, and you shall use
++*    all reasonable efforts to protect the confidentiality thereof, and to
++*    use this information only in connection with your use of Broadcom
++*    integrated circuit products.
++* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
++*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
++*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
++*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
++*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
++*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
++*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
++*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
++* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
++*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
++*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
++*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
++*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
++*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
++*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
++*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
++*****************************************************************************/
++
++#ifndef __USER_VCSM__H__INCLUDED__
++#define __USER_VCSM__H__INCLUDED__
++
++/* VideoCore Shared Memory - user interface library.
++**
++** This library provides all the necessary abstraction for any application to
++** make use of the shared memory service which is distributed across a kernel
++** driver and a videocore service.
++**
++** It is an application design decision to choose or not to use this service.
++**
++** The logical flow of operations that a user application needs to follow when
++** using this service is:
++**
++**       1) Initialize the service.
++**       2) Allocate shared memory blocks.
++**       3) Start using the allocated blocks.
++**          - In order to gain ownership on a block, lock the allocated block,
++**            locking a block returns a valid address that the user application
++**            can access.
++**          - When finished with using the block for the current execution cycle
++**            or function, and so when giving up the ownership, unlock the block.
++**       4) A block can be locked/unlocked as many times required - within or outside
++**          of - a specific execution context.
++**       5) To completely release an allocated block, free it.
++**       6) If the service is no longer required, terminate it.
++**
++**
++** Some generic considerations:
++
++** Allocating memory blocks.
++**
++**   Memory blocks can be allocated in different manners depending on the cache
++**   behavior desired.  A given block can either be:
++
++**       - Allocated in a non cached fashion all the way through host and videocore.
++**       - Allocated in a cached fashion on host OR videocore.
++**       - Allocated in a cached fashion on host AND videocore.
++**
++**   It is an application decision to determine how to allocate a block.  Evidently
++**   if the application will be doing substantial read/write accesses to a given block,
++**   it is recommended to allocate the block at least in a 'host cached' fashion for
++**   better results.
++**
++**
++** Locking memory blocks.
++**
++**   When the memory block has been allocated in a host cached fashion, locking the
++**   memory block (and so taking ownership of it) will trigger a cache invalidation.
++**
++**   For the above reason and when using host cached allocation, it is important that
++**   an application properly implements the lock/unlock mechanism to ensure cache will
++**   stay coherent, otherwise there is no guarantee it will at all be.
++**
++**   It is possible to dynamically change the host cache behavior (ie cached or non
++**   cached) of a given allocation without needing to free and re-allocate the block.
++**   This feature can be useful for such application which requires access to the block
++**   only at certain times and not otherwise.  By changing the cache behavior dynamically
++**   the application can optimize performance for a given duration of use.
++**   Such dynamic cache behavior remapping only applies to host cache and not videocore
++**   cache.  If one requires to change the videocore cache behavior, then a new block
++**   must be created to replace the old one.
++**
++**   On successful locking, a valid pointer is returned that the application can use
++**   to access to data inside the block.  There is no guarantee that the pointer will
++**   stay valid following the unlock action corresponding to this lock.
++**
++**
++** Unlocking memory blocks.
++**
++**   When the memory block has been allocated in a host cached fashion, unlocking the
++**   memory block (and so giving up its ownership) will trigger a cache flush unless
++**   explicitly asked not to flush the cache for performance reasons.
++**
++**   For the above reason and when using host cached allocation, it is important that
++**   an application properly implements the lock/unlock mechanism to ensure cache will
++**   stay coherent, otherwise there is no guarantee it will at all be.
++**
++**
++** A complete API is defined below.
++*/
++
++#ifdef __cplusplus
++extern "C"
++{
++#endif
++
++/* Different status that can be dumped.
++*/
++typedef enum
++{
++   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
++                                    // Result of the walk is seen in the videocore
++                                    // log.
++   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
++                                    // driver (ie for all processes).  Result of
++                                    // the walk is seen in the kernel log.
++   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
++                                    // driver (for current process).  Result of
++                                    // the walk is seen in the kernel log.
++   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
++                                    // driver (for current process).  Result of
++                                    // the walk is seen in the kernel log.
++   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
++                                    // VCSM_STATUS_HOST_WALK_MAP.
++                                    //
++   VCSM_STATUS_NONE,                // Must be last - invalid.
++
++} VCSM_STATUS_T;
++
++/* Different kind of cache behavior.
++*/
++typedef enum
++{
++   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
++   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
++   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
++   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
++
++} VCSM_CACHE_TYPE_T;
++
++/* Initialize the vcsm processing.
++**
++** Must be called once before attempting to do anything else.
++**
++** Returns 0 on success, -1 on error.
++*/
++int vcsm_init( void );
++
++
++/* Terminates the vcsm processing.
++**
++** Must be called when vcsm services are no longer needed; it will
++** take care of removing any allocation under the current process
++** control if deemed necessary.
++*/
++void vcsm_exit( void );
++
++
++/* Queries the status of the vcsm.
++**
++** Triggers dump of various kind of information, see the
++** different variants specified in VCSM_STATUS_T.
++**
++** Pid is optional.
++*/
++void vcsm_status( VCSM_STATUS_T status, int pid );
++
++
++/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
++** allocator.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++**
++** On success, the user must invoke vcsm_lock with the returned opaque
++** handle to gain access to the memory associated with the opaque handle.
++** When finished using the memory, the user calls vcsm_unlock_xx (see those
++** function definition for more details on the one that can be used).
++**
++** A well behaved application should make every attempt to lock/unlock
++** only for the duration it needs to access the memory data associated with
++** the opaque handle.
++*/
++unsigned int vcsm_malloc( unsigned int size, char *name );
++
++
++/* Allocates a cached block of memory of size 'size' via the vcsm memory
++** allocator, the type of caching requested is passed as argument of the
++** function call.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++**
++** On success, the user must invoke vcsm_lock with the returned opaque
++** handle to gain access to the memory associated with the opaque handle.
++** When finished using the memory, the user calls vcsm_unlock_xx (see those
++** function definition for more details on the one that can be used).
++**
++** A well behaved application should make every attempt to lock/unlock
++** only for the duration it needs to access the memory data associated with
++** the opaque handle.
++*/
++unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
++
++
++/* Shares an allocated block of memory via the vcsm memory allocator.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++**
++** On success, the user must invoke vcsm_lock with the returned opaque
++** handle to gain access to the memory associated with the opaque handle.
++** When finished using the memory, the user calls vcsm_unlock_xx (see those
++** function definition for more details on the one that can be used).
++**
++** A well behaved application should make every attempt to lock/unlock
++** only for the duration it needs to access the memory data associated with
++** the opaque handle.
++*/
++unsigned int vcsm_malloc_share( unsigned int handle );
++
++
++/* Resizes a block of memory allocated previously by vcsm_malloc.
++**
++** Returns:        0 on success
++**                 -errno on error.
++**
++** The handle must be unlocked by user prior to attempting any
++** resize action.
++**
++** On error, the original size allocated against the handle
++** remains available the same way it would be following a
++** successful vcsm_malloc.
++*/
++int vcsm_resize( unsigned int handle, unsigned int new_size );
++
++
++/* Frees a block of memory that was successfully allocated by
++** a prior call to vcsm_malloc.
++**
++** The handle should be considered invalid upon return from this
++** call.
++**
++** Whether any memory is actually freed up or not as the result of
++** this call will depend on many factors; if all goes well it will
++** be freed.  If something goes wrong, the memory will likely end up
++** being freed up as part of the vcsm_exit process.  In the end the
++** memory is guaranteed to be freed one way or another.
++*/
++void vcsm_free( unsigned int handle );
++
++
++/* Retrieves a videocore opaque handle from a mapped user address
++** pointer.  The videocore handle will correspond to the actual
++** memory mapped in videocore.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++**
++** Note: the videocore opaque handle is distinct from the user
++**       opaque handle (allocated via vcsm_malloc) and it is only
++**       significant for such application which knows what to do
++**       with it, for the others it is just a number with little
++**       use since nothing can be done with it (in particular
++**       for safety reason it cannot be used to map anything).
++*/
++unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
++
++
++/* Retrieves a videocore opaque handle from a user opaque
++** handle.  The videocore handle will correspond to the actual
++** memory mapped in videocore.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++**
++** Note: the videocore opaque handle is distinct from the user
++**       opaque handle (allocated via vcsm_malloc) and it is only
++**       significant for such application which knows what to do
++**       with it, for the others it is just a number with little
++**       use since nothing can be done with it (in particular
++**       for safety reason it cannot be used to map anything).
++*/
++unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
++
++
++/* Retrieves a user opaque handle from a mapped user address
++** pointer.
++**
++** Returns:        0 on error
++**                 a non-zero opaque handle on success.
++*/
++unsigned int vcsm_usr_handle( void *usr_ptr );
++
++
++/* Retrieves a mapped user address from an opaque user
++** handle.
++**
++** Returns:        0 on error
++**                 a non-zero address on success.
++**
++** On success, the address corresponds to the pointer
++** which can access the data allocated via the vcsm_malloc
++** call.
++*/
++void *vcsm_usr_address( unsigned int handle );
++
++
++/* Locks the memory associated with this opaque handle.
++**
++** Returns:        NULL on error
++**                 a valid pointer on success.
++**
++** A user MUST lock the handle received from vcsm_malloc
++** in order to be able to use the memory associated with it.
++**
++** On success, the pointer returned is only valid within
++** the lock context (ie until a corresponding vcsm_unlock_xx
++** is invoked).
++*/
++void *vcsm_lock( unsigned int handle );
++
++
++/* Locks the memory associated with this opaque handle.  The lock
++** also gives a chance to update the *host* cache behavior of the
++** allocated buffer if so desired.  The *videocore* cache behavior
++** of the allocated buffer cannot be changed by this call and such
++** attempt will be ignored.
++**
++** The system will attempt to honour the cache_update mode request,
++** the cache_result mode will provide the final answer on which cache
++** mode is really in use.  Failing to change the cache mode will not
++** result in a failure to lock the buffer as it is an application
++** decision to choose what to do if (cache_result != cache_update)
++**
++** The value returned in cache_result can only be considered valid if
++** the returned pointer is non NULL.  The cache_result pointer may be
++** NULL if the application does not care about the actual outcome of
++** its action with regards to the cache behavior change.
++**
++** Returns:        NULL on error
++**                 a valid pointer on success.
++**
++** A user MUST lock the handle received from vcsm_malloc
++** in order to be able to use the memory associated with it.
++**
++** On success, the pointer returned is only valid within
++** the lock context (ie until a corresponding vcsm_unlock_xx
++** is invoked).
++*/
++void *vcsm_lock_cache( unsigned int handle,
++                       VCSM_CACHE_TYPE_T cache_update,
++                       VCSM_CACHE_TYPE_T *cache_result );
++
++
++/* Unlocks the memory associated with this user mapped address.
++**
++** Returns:        0 on success
++**                 -errno on error.
++**
++** After unlocking a mapped address, the user should no longer
++** attempt to reference it.
++*/
++int vcsm_unlock_ptr( void *usr_ptr );
++
++
++/* Unlocks the memory associated with this user mapped address.
++** Apply special processing that would override the otherwise
++** default behavior.
++**
++** If 'cache_no_flush' is specified:
++**    Do not flush cache as the result of the unlock (if cache
++**    flush was otherwise applicable in this case).
++**
++** Returns:        0 on success
++**                 -errno on error.
++**
++** After unlocking a mapped address, the user should no longer
++** attempt to reference it.
++*/
++int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
++
++
++/* Unlocks the memory associated with this user opaque handle.
++**
++** Returns:        0 on success
++**                 -errno on error.
++**
++** After unlocking an opaque handle, the user should no longer
++** attempt to reference the mapped address once associated
++** with it.
++*/
++int vcsm_unlock_hdl( unsigned int handle );
++
++
++/* Unlocks the memory associated with this user opaque handle.
++** Apply special processing that would override the otherwise
++** default behavior.
++**
++** If 'cache_no_flush' is specified:
++**    Do not flush cache as the result of the unlock (if cache
++**    flush was otherwise applicable in this case).
++**
++** Returns:        0 on success
++**                 -errno on error.
++**
++** After unlocking an opaque handle, the user should no longer
++** attempt to reference the mapped address once associated
++** with it.
++*/
++int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
++
++/* Clean and/or invalidate the memory associated with this user opaque handle
++**
++** Returns:        non-zero on error
++**
++** structure contains a list of flush/invalidate commands. Commands are:
++** 0: nop
++** 1: invalidate       given virtual range in L1/L2
++** 2: clean            given virtual range in L1/L2
++** 3: clean+invalidate given virtual range in L1/L2
++** 4: flush all L1/L2
++*/
++struct vcsm_user_clean_invalid_s {
++   struct {
++      unsigned int cmd;
++      unsigned int handle;
++      unsigned int addr;
++      unsigned int size;
++   } s[8];
++};
++
++int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif /* __USER_VCSM__H__INCLUDED__ */
 diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
 new file mode 100644
 index 0000000..9580165
@@ -38057,81 +15466,3090 @@ index 0000000..f0109f4
 +
 +#endif
 +
--- 
-2.7.4
-
-
-From a6da64e1ca42f0394ccfa55dca782a456841da94 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 1 Mar 2016 14:21:25 +0000
-Subject: [PATCH 2/2] Set VPU scheduling thread to high priority after creation
-
----
- libavcodec/rpi_qpu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 47 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-index b0c9bc5..ee19231 100644
---- a/libavcodec/rpi_qpu.c
-+++ b/libavcodec/rpi_qpu.c
-@@ -182,9 +182,55 @@ static int gpu_init(volatile struct GPU **gpu) {
-     err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-     //printf("Created thread\n");
-     if (err) {
--        printf("Failed to create vpu thread\n");
-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
-         return -4;
-     }
+diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+index f7adb52..708526e 100644
+--- a/libavcodec/utils.c
++++ b/libavcodec/utils.c
+@@ -26,6 +26,12 @@
+  */
+ 
+ #include "config.h"
 +
-+    {
-+      struct sched_param param = {0};
-+      int policy = 0;
++#ifdef RPI
++// Move video buffers to GPU memory
++#define RPI_GPU_BUFFERS
++#endif
 +
-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+      {
-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+      }
-+      else
-+      {
-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
+ #include "libavutil/atomic.h"
+ #include "libavutil/attributes.h"
+ #include "libavutil/avassert.h"
+@@ -64,6 +70,10 @@
+ #include "libavutil/ffversion.h"
+ const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+ 
++#ifdef RPI_GPU_BUFFERS
++#include "rpi_qpu.h"
++#endif
 +
-+        policy = SCHED_FIFO;
-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+ #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
+ static int default_lockmgr_cb(void **arg, enum AVLockOp op)
+ {
+@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+     return ret;
+ }
+ 
++#ifdef RPI_GPU_BUFFERS
++static void rpi_buffer_default_free(void *opaque, uint8_t *data)  // free callback handed to av_buffer_create() by rpi_buffer_alloc()
++{
++    GPU_MEM_PTR_T *p = opaque;  // opaque is the descriptor rpi_buffer_alloc() allocated; 'data' is unused here
++    gpu_free(p);  // release the GPU allocation first...
++    av_free(p);  // ...then the av_malloc'd descriptor itself
++}
 +
-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
-+            policy,
-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+            param.sched_priority);
++static AVBufferRef *rpi_buffer_alloc(int size)
++{
++    AVBufferRef *ret = NULL;
++    uint8_t    *data = NULL;
++    GPU_MEM_PTR_T *p;
 +
-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
-+        {
-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
-+        }
-+        else
-+        {
-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
-+          {
-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+          }
-+          else
-+          {
-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
-+                policy,
-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+                param.sched_priority);
-+          }
-+        }
-+      }
++    static int total=0;
++    total+=size;
 +
++    p = av_malloc(sizeof *p);
++    if (!p)
++        return NULL;
++
++    if (gpu_malloc_cached(size,p)<0) {  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
++        av_free(p);  // descriptor is still ours: free it instead of leaking it on every failed allocation
++        return NULL;
++    }
++    data = p->arm;
++    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
++    if (!data) {  // NOTE(review): the GPU allocation may also need gpu_free(p) here - confirm gpu_free() tolerates arm == NULL
++        av_free(p);
++        return NULL;
++    }
++    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);  // on success p is owned by the buffer's free callback
++    if (!ret) {
++        gpu_free(p);
++        av_freep(&p);
 +    }
 +
-   }
++    return ret;
++}
++#endif
++
+ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+ {
+     FramePool *pool = avctx->internal->pool;
+@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+             av_buffer_pool_uninit(&pool->pools[i]);
+             pool->linesize[i] = linesize[i];
+             if (size[i]) {
++#ifdef RPI_GPU_BUFFERS
++                if (avctx->codec_id == AV_CODEC_ID_HEVC)
++                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
++                                                     CONFIG_MEMORY_POISONING ?
++                                                        NULL :
++                                                        rpi_buffer_alloc);
++                else
++#endif
+                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                      CONFIG_MEMORY_POISONING ?
+                                                         NULL :
+diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
+index b31d233..2767306 100644
+--- a/libavformat/mpegts.c
++++ b/libavformat/mpegts.c
+@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
  #endif
+     { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
+     { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC        },
+-    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
++    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC   },
+     { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
+     { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
+     { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
+diff --git a/libavformat/utils.c b/libavformat/utils.c
+index 6f343f2..83f26d5 100644
+--- a/libavformat/utils.c
++++ b/libavformat/utils.c
+@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
+         int default_stream_index = av_find_default_stream_index(s);
+         if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
+             for (i = 0; i < s->nb_streams; i++) {
+-                if (av_find_program_from_stream(s, NULL, i))
++                if (0 && av_find_program_from_stream(s, NULL, i))
+                     continue;
+                 s->streams[i]->pts_wrap_reference = pts_wrap_reference;
+                 s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
+diff --git a/libavutil/buffer.c b/libavutil/buffer.c
+index 694e116..203ca7b 100644
+--- a/libavutil/buffer.c
++++ b/libavutil/buffer.c
+@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
  
--- 
-2.7.4
-
+     return ret;
+ }
++
++// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
++void *av_buffer_pool_opaque(AVBufferRef *ref) {
++  BufferPoolEntry *buf = av_buffer_get_opaque(ref);  // NOTE(review): assumes ref came from an AVBufferPool so its opaque is a BufferPoolEntry - confirm at call sites
++  return buf->opaque;  // buf is dereferenced without a NULL check
++}
+diff --git a/libavutil/buffer.h b/libavutil/buffer.h
+index 0c0ce12..82e0bc3 100644
+--- a/libavutil/buffer.h
++++ b/libavutil/buffer.h
+@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
+  */
+ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+ 
++// Return the opaque for the underlying frame
++void *av_buffer_pool_opaque(AVBufferRef *ref);
++
+ /**
+  * @}
+  */
+diff --git a/pi-util/conf.sh b/pi-util/conf.sh
+new file mode 100755
+index 0000000..8b596a2
+--- /dev/null
++++ b/pi-util/conf.sh
+@@ -0,0 +1,33 @@
++echo "Configure for Pi2/3"
++
++RPI_BUILDROOT=`pwd`/build
++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++./configure --enable-cross-compile\
++ --arch=armv6t2\
++ --cpu=cortex-a7\
++ --target-os=linux\
++ --disable-stripping\
++ --disable-thumb\
++ --enable-mmal\
++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
++
++# --enable-extra-warnings\
++# --arch=armv71\
++# --enable-shared\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+new file mode 100644
+index 0000000..61d1399
+--- /dev/null
++++ b/pi-util/conf_h265.csv
+@@ -0,0 +1,144 @@
++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
++2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+new file mode 100644
+index 0000000..38f942f
+--- /dev/null
++++ b/pi-util/ffconf.py
+@@ -0,0 +1,146 @@
++#!/usr/bin/env python
++
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++conf_root = "/opt/conform/h265"
++ffmpeg_exec = "./ffmpeg"
++
++def testone(fileroot, name, es_file, md5_file):  # decode one stream with ffmpeg and compare its MD5 with the reference; returns True on match
++    tmp_root = "/tmp"
++
++    dec_file = os.path.join(tmp_root, name + ".dec.md5")
++    try:
++        os.remove(dec_file)  # drop any result left over from an earlier run
++    except:
++        pass
++
++    flog = open(os.path.join(tmp_root, name + ".log"), "wt")  # per-test log: receives ffmpeg's output and the verdict line
++
++    # Unaligned needed for cropping conformance
++    rstr = subprocess.call(  # exit status is stored but never inspected; only the MD5 comparison below decides the result
++        [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++        stdout=flog, stderr=subprocess.STDOUT)
++
++    try:
++        m1 = None  # match for the reference MD5
++        m2 = None  # match for the MD5 ffmpeg just produced
++        with open(os.path.join(fileroot, md5_file)) as f:
++            for line in f:
++                m1 = re.search("[0-9a-f]{32}", line.lower())  # first 32-hex-digit run anywhere in the reference file, case-insensitive
++                if m1:
++                    break
++
++        with open(dec_file) as f:
++            m2 = re.search("[0-9a-f]{32}", f.readline())  # only the first line of the decoder output is examined
++    except:
++        pass  # a missing/unreadable file leaves m1 or m2 as None, reported below as "Cannot find"
++
++    rv = False
++    if  m1 and m2 and m1.group() == m2.group():
++        print >> flog, "Match: " + m1.group()
++        rv = True
++    elif not m1:
++        print >> flog, "****** Cannot find m1"
++    elif not m2:
++        print >> flog, "****** Cannot find m2"
++    else:
++        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
++    flog.close()
++    return rv
++
++def scandir(root):  # build (1, test name, stream file, md5 file) rows for every sub-directory of root
++    aconf = []
++    ents = os.listdir(root)  # was conf_root: the root argument was silently ignored
++    ents.sort(key=str.lower)
++    for name in ents:
++        test_path = os.path.join(root, name)
++        if S_ISDIR(os.stat(test_path).st_mode):
++            files = os.listdir(test_path)
++            es_file = "?"  # "?" marks a file that was not found
++            md5_file = "?"
++            for f in files:
++                (base, ext) = os.path.splitext(f)
++                if base[0] == '.':  # skip hidden files
++                    pass
++                elif ext == ".bit" or ext == ".bin":
++                    es_file = f
++                elif ext == ".md5":
++                    if md5_file == "?":  # first .md5 seen wins...
++                        md5_file = f
++                    elif base[-3:] == "yuv":  # ...unless a later one has a base name ending in "yuv"
++                        md5_file = f
++            aconf.append((1, name, es_file, md5_file))  # leading 1 = expected to pass
++    return aconf
++
++def runtest(name, tests):  # True when name should run: no filter given, or name starts with any requested prefix
++    if not tests:
++        return True
++    for t in tests:
++        if name.startswith(t):
++            return True
++    return False  # moved out of the loop: previously only the first prefix in tests was ever compared
++
++def doconf(csva, tests):  # run every selected CSV row through testone() and print a summary of unexpected results
++    failures = []  # rows marked 1 that failed
++    unx_success = []  # rows marked 2 that passed
++    for a in csva:
++        exp_test = int(a[0])  # first CSV column: 0 skips the row, 1 = expected to pass, any other value (2 in the CSV) = expected to fail
++        if (exp_test and runtest(a[1], tests)):
++            name = a[1]
++            print "==== ", name,  # trailing comma: the verdict is printed on the same line
++            sys.stdout.flush()
++
++            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
++                if exp_test == 1:
++                    failures.append(name)
++                    print ": * FAIL *"
++                else:
++                    print ": fail"
++            else:
++                if exp_test == 2:
++                    print ": * OK *"
++                    unx_success.append(name)
++                else:
++                    print ": ok"
++
++
++    if failures or unx_success:
++        print "Unexpected Failures:", failures
++        print "Unexpected Success: ", unx_success
++    else:
++        print "All tests normal"
++
++
++class ConfCSVDialect(csv.Dialect):  # dialect passed to csv.reader() when loading the conformance CSV
++    delimiter = ','
++    doublequote = True
++    lineterminator = '\n'
++    quotechar='"'
++    quoting = csv.QUOTE_MINIMAL
++    skipinitialspace = True  # ignore whitespace that follows a delimiter
++    strict = True  # raise csv.Error on malformed input instead of guessing
++
++if __name__ == '__main__':
++
++    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
++    argp.add_argument("tests", nargs='*')  # optional name prefixes selecting which tests run
++    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
++    argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
++    args = argp.parse_args()
++
++    if args.csvgen:
++        csv.writer(sys.stdout).writerows(scandir(conf_root))  # emit a fresh CSV for conf_root on stdout, then stop
++        sys.exit(0)  # sys.exit rather than the site-injected exit(), which is meant for interactive use only
++
++    with open(args.csv, 'rt') as csvfile:
++        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
++
++
++    doconf(csva, args.tests)
++
+diff --git a/pi-util/qasm.py b/pi-util/qasm.py
+new file mode 100644
+index 0000000..1eacc04
+--- /dev/null
++++ b/pi-util/qasm.py
+@@ -0,0 +1,2502 @@
++#!/usr/bin/env python
++
++#    add.ifz.setf  -, r0, ra0 ; fmul  rb1, rany2, 0 ; thrend # comment
++#    add  r0, r0, 1                    # implicit mul nop
++#    nop                               # explicit add nop, implicit mul nop
++#    bkpt                              # implicit add/mul nop
++#    mov  r0, 0x1234                   # hex immediate
++#    mov  r0, 20 * 40                  # expressions...
++#    mov  r0, f(sqrt(2.0) * 3.0)       # f() converts float to bits
++#    mov  r0, a:label                  # put address of label in r0
++# :label
++#    bra.allnn  ra2, a:1f              # branch to label 1 (searching forward), using absolute address
++# :1
++#    brr.anyz  -, r:1b                 # branch to label 1 (searching backward), using relative address
++# :1                                   # multiple definitions of numeric labels (differentiated using f/b)
++# .set my_val, 3                       # introduce alias for 3
++# .set my_reg, r0                      # and for r0
++#    mov  my_reg, my_val               # then use them
++# .set my_reg2, my_reg + my_val        # r0 plus 3 is r3
++# .macro my_add, a, b, c               # a, b, c act as if .set on entry
++# .set my_val, 10
++#    add  a, b, c
++#    mov  r0, my_val                   # 10
++# .endm                                # forget all .sets since .macro (including arg .sets)
++#    mov  r0, my_val                   # 3
++#    my_add  my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
++
++import math
++import optparse
++import os
++import random
++import re
++import struct
++import sys
++import time
++
++###############################################################################
++# constants
++###############################################################################
++
++# ops
++######
++
++# negatives are internal qasm ops
++
++AOP_MOV     = -3   # two operands
++AOP_BRA     = -2   # two operands
++AOP_BRR     = -1   # two operands
++AOP_NOP     = 0x00 # no operands
++AOP_FADD    = 0x01
++AOP_FSUB    = 0x02
++AOP_FMIN    = 0x03
++AOP_FMAX    = 0x04
++AOP_FMINABS = 0x05
++AOP_FMAXABS = 0x06
++AOP_FTOI    = 0x07 # two operands
++AOP_ITOF    = 0x08 # two operands
++AOP_ADD     = 0x0c
++AOP_SUB     = 0x0d
++AOP_SHR     = 0x0e
++AOP_ASR     = 0x0f
++AOP_ROR     = 0x10
++AOP_SHL     = 0x11
++AOP_MIN     = 0x12
++AOP_MAX     = 0x13
++AOP_AND     = 0x14
++AOP_OR      = 0x15
++AOP_XOR     = 0x16
++AOP_NOT     = 0x17 # two operands
++AOP_CLZ     = 0x18 # two operands
++AOP_V8ADDS  = 0x1e
++AOP_V8SUBS  = 0x1f
++
++MOP_MOV    = -1  # two operands
++MOP_NOP    = 0x0 # no operands
++MOP_FMUL   = 0x1
++MOP_MUL24  = 0x2
++MOP_V8MULD = 0x3
++MOP_V8MIN  = 0x4
++MOP_V8MAX  = 0x5
++MOP_V8ADDS = 0x6
++MOP_V8SUBS = 0x7
++
++# ldi modes
++############
++
++LDI_32          = 0
++LDI_EL_SIGNED   = 1
++LDI_EL_UNSIGNED = 3
++LDI_SEMA        = 4
++
++# conds
++########
++
++COND_NEVER  = 0
++COND_ALWAYS = 1
++COND_IFZ    = 2
++COND_IFNZ   = 3
++COND_IFN    = 4
++COND_IFNN   = 5
++COND_IFC    = 6
++COND_IFNC   = 7
++
++BCOND_ALLZ   = 0
++BCOND_ALLNZ  = 1
++BCOND_ANYZ   = 2
++BCOND_ANYNZ  = 3
++BCOND_ALLN   = 4
++BCOND_ALLNN  = 5
++BCOND_ANYN   = 6
++BCOND_ANYNN  = 7
++BCOND_ALLC   = 8
++BCOND_ALLNC  = 9
++BCOND_ANYC   = 10
++BCOND_ANYNC  = 11
++BCOND_ALWAYS = 15
++
++# packing/unpacking
++####################
++
++# regfile a pack modes
++PACK_A_NOP   = 0
++PACK_A_16A   = 1
++PACK_A_16B   = 2
++PACK_A_8888  = 3
++PACK_A_8A    = 4
++PACK_A_8B    = 5
++PACK_A_8C    = 6
++PACK_A_8D    = 7
++PACK_A_32S   = 8
++PACK_A_16AS  = 9
++PACK_A_16BS  = 10
++PACK_A_8888S = 11
++PACK_A_8AS   = 12
++PACK_A_8BS   = 13
++PACK_A_8CS   = 14
++PACK_A_8DS   = 15
++
++# mul unit pack modes
++PACK_MUL_NOP  = 0
++PACK_MUL_8888 = 3
++PACK_MUL_8A   = 4
++PACK_MUL_8B   = 5
++PACK_MUL_8C   = 6
++PACK_MUL_8D   = 7
++
++# regfile a unpack modes
++UNPACK_A_NOP = 0
++UNPACK_A_16A = 1
++UNPACK_A_16B = 2
++UNPACK_A_8R  = 3
++UNPACK_A_8A  = 4
++UNPACK_A_8B  = 5
++UNPACK_A_8C  = 6
++UNPACK_A_8D  = 7
++
++# r4 unpack modes
++UNPACK_R4_NOP = 0
++UNPACK_R4_16A = 1
++UNPACK_R4_16B = 2
++UNPACK_R4_8R  = 3
++UNPACK_R4_8A  = 4
++UNPACK_R4_8B  = 5
++UNPACK_R4_8C  = 6
++UNPACK_R4_8D  = 7
++
++PACK_TYPE_INT    = 0
++PACK_TYPE_FLOAT  = 1
++PACK_TYPE_EITHER = -1
++
++PACK_MODE_A      = 0 # regfile a
++PACK_MODE_M      = 1 # mul unit
++PACK_MODE_EITHER = -1
++
++UNPACK_LOC_A     = 0 # regfile a
++UNPACK_LOC_R4    = 1 # r4
++UNPACK_LOC_AB    = 2 # either regfile a or regfile b
++UNPACK_LOC_OTHER = 3 # somewhere else
++
++# args
++#######
++
++# loc_t, ie internal
++MUX_AC  = 0
++MUX_ANY = 1
++MUX_A   = 2
++MUX_B   = 3
++RW_EITHER = 0
++RW_READ   = 1
++RW_WRITE  = 2
++
++RADDR_NOP = 39
++
++# negatives are for internal use
++RMUX_SEMA  = -6
++RMUX_LABEL = -5
++RMUX_IMMV  = -4
++RMUX_IMM   = -3
++RMUX_AC    = -2
++RMUX_ANY   = -1
++RMUX_A0    = 0 # followed by A1, A2, A3, A4, A5
++RMUX_A     = 6
++RMUX_B     = 7
++
++WADDR_R0  = 32 # followed by R1, R2, R3
++WADDR_NOP = 39
++
++WMUX_ANY = 0
++WMUX_A   = 1
++WMUX_B   = 2
++
++# signals
++##########
++
++SIG_BKPT       = 0
++SIG_NORMAL     = 1
++SIG_THRSW      = 2
++SIG_THREND     = 3
++SIG_SBWAIT     = 4
++SIG_SBDONE     = 5
++SIG_INT        = 6 # on a0
++SIG_LTHRSW     = 6 # on b0
++SIG_LOADCV     = 7
++SIG_LOADC      = 8
++SIG_LDCEND     = 9
++SIG_LDTMU0     = 10
++SIG_LDTMU1     = 11
++SIG_ROTATE     = 12 # on a0
++SIG_LOADAM     = 12 # on b0
++SIG_SMALLIMMED = 13
++SIG_IMMED      = 14
++SIG_BRANCH     = 15
++
++# multi-line assembler constructs
++##################################
++
++CONSTRUCT_MACRO = 0x1
++CONSTRUCT_IF    = 0x2
++CONSTRUCT_ELSE  = 0x4
++CONSTRUCT_REP   = 0x8
++
++###############################################################################
++# helpers
++###############################################################################
++
++def asm_error(message, location = None):  # report a fatal assembler error on stderr and terminate; never returns
++   if location is None:
++      location = current_location  # fall back to the global current_location (set elsewhere in the file)
++   if location == '':  # empty location: print the message without a location prefix
++      sys.stderr.write('qasm ERROR: %s\n' % message)
++   else:
++      sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
++   sys.exit(-1)
++
++def asm_warning(message, location = None):  # print a warning on stderr unless warnings are currently suppressed
++   if disable_warnings or (nwarn_level != 0):  # both are globals defined elsewhere in the file; either one silences the warning
++      return
++   if location is None:
++      location = current_location
++   if location == '':
++      sys.stderr.write('qasm WARNING: %s\n' % message)
++   else:
++      sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
++   if warnings_are_errors:  # escalate: asm_error() exits the process
++      asm_error('warnings are errors!', location)
++
++# smart_split('') = []
++# smart_split('a') = ['a']
++# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
++def smart_split(s, delim = ',', count = 0):  # split s on delim, ignoring delims nested inside (), [] or {}
++   if len(s) == 0:
++      return []
++   parts = []
++   depth = 0  # current bracket nesting level
++   i = 0  # start index of the part being accumulated
++   for j in xrange(len(s)):
++      if s[j] in '([{':
++         depth += 1
++      elif s[j] in ')]}':
++         depth -= 1
++      elif (s[j] == delim) and (depth == 0):
++         parts.append(s[i:j])
++         i = j + 1
++         if len(parts) == count:  # count > 0 caps the number of splits; the unsplit remainder becomes the last part
++            break
++   if depth != 0:  # only the net balance is checked, not bracket kinds or their order
++      asm_error('bracket nesting fail')
++   parts.append(s[i:])
++   return parts
++
++def is_int(x):  # True for either Python 2 integer type (int or long)
++   return isinstance(x, int) or isinstance(x, long)
++
++###############################################################################
++# "parsing" stuff
++###############################################################################
++
++re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
++re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
++re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
++re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
++re_include = re.compile('\\.include\\s(?P<filename>.+)$')
++re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
++re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
++re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
++re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
++re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
++re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
++re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
++re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
++re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
++re_label_ref_left = re.compile('\\b([ar]):')
++re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
++re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
++
++# ops
++######
++
++aops = { # mnemonic -> (AOP_* constant, count); the count looks like the operand count incl. dst -- confirm at the users of get_aop
++   'mov': (AOP_MOV, 2),
++   'bra': (AOP_BRA, 2),
++   'brr': (AOP_BRR, 2),
++   'nop': (AOP_NOP, 0),
++   'fadd': (AOP_FADD, 3),
++   'fsub': (AOP_FSUB, 3),
++   'fmin': (AOP_FMIN, 3),
++   'fmax': (AOP_FMAX, 3),
++   'fminabs': (AOP_FMINABS, 3),
++   'fmaxabs': (AOP_FMAXABS, 3),
++   'ftoi': (AOP_FTOI, 2),
++   'itof': (AOP_ITOF, 2),
++   'add': (AOP_ADD, 3),
++   'sub': (AOP_SUB, 3),
++   'shr': (AOP_SHR, 3),
++   'asr': (AOP_ASR, 3),
++   'ror': (AOP_ROR, 3),
++   'shl': (AOP_SHL, 3),
++   'min': (AOP_MIN, 3),
++   'max': (AOP_MAX, 3),
++   'and': (AOP_AND, 3),
++   'or': (AOP_OR, 3),
++   'xor': (AOP_XOR, 3),
++   'not': (AOP_NOT, 2),
++   'clz': (AOP_CLZ, 2),
++   'v8adds': (AOP_V8ADDS, 3), # also present in mops
++   'v8subs': (AOP_V8SUBS, 3)} # also present in mops
++
++def get_aop(aop): # look up the aops entry for a mnemonic
++   if aop not in aops:
++      asm_error('invalid aop') # presumably does not return -- otherwise the lookup below raises KeyError
++   return aops[aop]
++
++mops = { # mnemonic -> (MOP_* constant, count); same tuple shape as aops
++   'mov': (MOP_MOV, 2),
++   'nop': (MOP_NOP, 0),
++   'fmul': (MOP_FMUL, 3),
++   'mul24': (MOP_MUL24, 3),
++   'v8muld': (MOP_V8MULD, 3),
++   'v8min': (MOP_V8MIN, 3),
++   'v8max': (MOP_V8MAX, 3),
++   'v8adds': (MOP_V8ADDS, 3),
++   'v8subs': (MOP_V8SUBS, 3)}
++
++def get_mop(mop): # look up the mops entry for a mnemonic
++   if mop not in mops:
++      asm_error('invalid mop') # presumably does not return -- otherwise the lookup below raises KeyError
++   return mops[mop]
++
++# conds
++########
++
++conds = { # condition suffix -> COND_* constant (a missing suffix is handled by get_cond)
++   'ifz': COND_IFZ,
++   'ifnz': COND_IFNZ,
++   'ifn': COND_IFN,
++   'ifnn': COND_IFNN,
++   'ifc': COND_IFC,
++   'ifnc': COND_IFNC}
++
++def get_cond(cond): # map a condition suffix to its COND_* constant
++   if not cond: # no suffix given (None or empty)
++      return COND_ALWAYS
++   if cond not in conds:
++      asm_error('invalid cond')
++   return conds[cond]
++
++bconds = { # branch condition suffix -> BCOND_* constant (all/any over z, n and c, plain or negated)
++   'allz': BCOND_ALLZ,
++   'allnz': BCOND_ALLNZ,
++   'anyz': BCOND_ANYZ,
++   'anynz': BCOND_ANYNZ,
++   'alln': BCOND_ALLN,
++   'allnn': BCOND_ALLNN,
++   'anyn': BCOND_ANYN,
++   'anynn': BCOND_ANYNN,
++   'allc': BCOND_ALLC,
++   'allnc': BCOND_ALLNC,
++   'anyc': BCOND_ANYC,
++   'anync': BCOND_ANYNC}
++
++def get_bcond(bcond): # map a branch condition suffix to its BCOND_* constant
++   if not bcond: # no suffix given (None or empty)
++      return BCOND_ALWAYS
++   if bcond not in bconds:
++      asm_error('invalid bcond')
++   return bconds[bcond]
++
++def get_setf(setf):
++   if not setf:
++      return False
++   return True
++
++# packing/unpacking
++####################
++
++packs = { # pack suffix -> (pack code, PACK_TYPE_* the result must have, PACK_MODE_* it is encoded in)
++   '16a':    (PACK_A_16A,    PACK_TYPE_INT,    PACK_MODE_A),
++   '16b':    (PACK_A_16B,    PACK_TYPE_INT,    PACK_MODE_A),
++   '16af':   (PACK_A_16A,    PACK_TYPE_FLOAT,  PACK_MODE_A), # same code as 16a, float result required
++   '16bf':   (PACK_A_16B,    PACK_TYPE_FLOAT,  PACK_MODE_A), # same code as 16b, float result required
++   '8abcd':  (PACK_A_8888,   PACK_TYPE_EITHER, PACK_MODE_A),
++   '8a':     (PACK_A_8A,     PACK_TYPE_EITHER, PACK_MODE_A),
++   '8b':     (PACK_A_8B,     PACK_TYPE_EITHER, PACK_MODE_A),
++   '8c':     (PACK_A_8C,     PACK_TYPE_EITHER, PACK_MODE_A),
++   '8d':     (PACK_A_8D,     PACK_TYPE_EITHER, PACK_MODE_A),
++   's':      (PACK_A_32S,    PACK_TYPE_EITHER, PACK_MODE_A),
++   '16as':   (PACK_A_16AS,   PACK_TYPE_EITHER, PACK_MODE_A),
++   '16bs':   (PACK_A_16BS,   PACK_TYPE_EITHER, PACK_MODE_A),
++   '8abcds': (PACK_A_8888S,  PACK_TYPE_EITHER, PACK_MODE_A),
++   '8as':    (PACK_A_8AS,    PACK_TYPE_EITHER, PACK_MODE_A),
++   '8bs':    (PACK_A_8BS,    PACK_TYPE_EITHER, PACK_MODE_A),
++   '8cs':    (PACK_A_8CS,    PACK_TYPE_EITHER, PACK_MODE_A),
++   '8ds':    (PACK_A_8DS,    PACK_TYPE_EITHER, PACK_MODE_A),
++   '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), # the PACK_MODE_M entries are the PACK_MUL_* codes
++   '8ac':    (PACK_MUL_8A,   PACK_TYPE_EITHER, PACK_MODE_M),
++   '8bc':    (PACK_MUL_8B,   PACK_TYPE_EITHER, PACK_MODE_M),
++   '8cc':    (PACK_MUL_8C,   PACK_TYPE_EITHER, PACK_MODE_M),
++   '8dc':    (PACK_MUL_8D,   PACK_TYPE_EITHER, PACK_MODE_M)}
++
++def get_pack(pack): # map a pack suffix to its (code, type, mode) triple from packs
++   if not pack: # no pack requested: code 0 with no type or mode constraint
++      return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
++   if pack not in packs:
++      asm_error('invalid pack')
++   return packs[pack]
++
++a_unpacks = { # regfile a unpack suffix -> (UNPACK_A_* code, PACK_TYPE_* the consuming op must have)
++   '16a':  (UNPACK_A_16A, PACK_TYPE_INT),
++   '16b':  (UNPACK_A_16B, PACK_TYPE_INT),
++   '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
++   '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
++   '8dr':  (UNPACK_A_8R,  PACK_TYPE_EITHER),
++   '8a':   (UNPACK_A_8A,  PACK_TYPE_INT),
++   '8b':   (UNPACK_A_8B,  PACK_TYPE_INT),
++   '8c':   (UNPACK_A_8C,  PACK_TYPE_INT),
++   '8d':   (UNPACK_A_8D,  PACK_TYPE_INT),
++   '8ac':  (UNPACK_A_8A,  PACK_TYPE_FLOAT), # the c suffixed forms reuse the 8a..8d codes with a float requirement
++   '8bc':  (UNPACK_A_8B,  PACK_TYPE_FLOAT),
++   '8cc':  (UNPACK_A_8C,  PACK_TYPE_FLOAT),
++   '8dc':  (UNPACK_A_8D,  PACK_TYPE_FLOAT)}
++
++def get_a_unpack(unpack): # map a regfile a unpack suffix to a (code, type, UNPACK_LOC_A) triple
++   if not unpack: # no unpack requested
++      return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
++   if unpack not in a_unpacks:
++      asm_error('invalid ra unpack')
++   return a_unpacks[unpack] + (UNPACK_LOC_A,) # extend the table pair with the location tag
++
++r4_unpacks = { # r4 unpack suffix -> UNPACK_R4_* code (no type requirement is stored here)
++   '16af': UNPACK_R4_16A,
++   '16bf': UNPACK_R4_16B,
++   '8dr':  UNPACK_R4_8R,
++   '8ac':  UNPACK_R4_8A,
++   '8bc':  UNPACK_R4_8B,
++   '8cc':  UNPACK_R4_8C,
++   '8dc':  UNPACK_R4_8D}
++
++def get_r4_unpack(unpack): # map an r4 unpack suffix to a (code, PACK_TYPE_EITHER, UNPACK_LOC_R4) triple
++   if not unpack: # no unpack requested
++      return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
++   if unpack not in r4_unpacks:
++      asm_error('invalid r4 unpack')
++   return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4)
++
++# args
++#######
++
++class loc_t:
++   def __init__(self, mux, i, rot, r5_rot, pack, rw):
++      self.mux = mux
++      self.i = i
++      self.rot = rot % 16
++      self.r5_rot = r5_rot % 16
++      self.pack = pack
++      self.rw = rw
++
++   def copy(self):
++      return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
++
++   def __add__(self, i):
++      if not is_int(i):
++         raise Exception('can only add integer to loc')
++      return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
++
++   def __sub__(self, i):
++      if not is_int(i):
++         raise Exception('can only subtract integer from loc')
++      return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
++
++   def __cmp__(self, other):
++      if is_int(other):
++         return cmp(self.i, other)
++      if not isinstance(other, loc_t):
++         raise Exception('can only compare loc to integer or other loc')
++      if self.mux != other.mux:
++         return cmp(self.mux, other.mux)
++      if self.i != other.i:
++         return cmp(self.i, other.i)
++      if self.rot != other.rot:
++         return cmp(self.rot, other.rot)
++      if self.r5_rot != other.r5_rot:
++         return cmp(self.r5_rot, other.r5_rot)
++      return cmp(self.pack, other.pack)
++
++   def is_r5(self):
++      return (self.mux == MUX_AC) and (self.i == 5)
++
++   def shift(self, rot, left):
++      if isinstance(rot, loc_t) and rot.is_r5():
++         if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
++            raise Exception('can\'t rotate by rotated/unpacked r5')
++         return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
++      if not is_int(rot):
++         raise Exception('can only rotate by integer or r5')
++      return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
++
++   def __lshift__(self, rot):
++      return self.shift(rot, True)
++
++   def __rshift__(self, rot):
++      return self.shift(rot, False)
++
++   def __getattr__(self, name):
++      # discard the first character if it is an underscore. this is a total hack
++      # to allow packs starting with a digit to work
++      if name[0] == '_':
++         name = name[1:]
++      if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
++         if self.pack:
++            raise Exception('can\'t specify two packs')
++         return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
++      raise AttributeError()
++
++   def __str__(self):
++      if self.mux == MUX_AC:
++         return 'r%d' % self.i
++      if self.mux == MUX_ANY:
++         return 'rany%d' % self.i
++      if self.mux == MUX_A:
++         return 'ra%d' % self.i
++      if self.mux == MUX_B:
++         return 'rb%d' % self.i
++      assert 0
++
++class sema_t: # value produced by sacq(i)/srel(i): a semaphore index plus an acquire/release flag
++   def __init__(self, acq, i):
++      if not is_int(i):
++         raise Exception('semaphore index must be integer')
++      self.acq = acq # True for sacq, False for srel
++      self.i = i # range is checked later, in get_src
++
++class label_t: # a label reference: relative flag, label name and an offset adjusted with + and -
++   def __init__(self, rel, name, offset):
++      self.rel = rel
++      self.name = name
++      self.offset = offset
++
++   def __add__(self, offset): # label + n gives a new reference with a larger offset
++      return label_t(self.rel, self.name, self.offset + offset)
++
++   def __sub__(self, offset): # label - n gives a new reference with a smaller offset
++      return label_t(self.rel, self.name, self.offset - offset)
++
++class label_maker_t: # attribute access on an instance builds a label_t (see r_label_maker/a_label_maker in arg_defs)
++   def __init__(self, rel):
++      self.rel = rel # passed through to every label_t this maker creates
++
++   def __getattr__(self, name):
++      # we discard the first character. this is a total hack to allow numeric labels to work
++      if not re_label_ref_right.match(name[1:]):
++         raise Exception('invalid label reference')
++      return label_t(self.rel, name[1:], 0) # new references start at offset 0
++
++def bits(x, n):
++   if (x >> n) != 0:
++      raise Exception('%d doesn\'t fit in %d bits' % (x, n))
++   return x
++
++def bitsw(x, n):
++   if x == (1 << n):
++      x = 0
++   return bits(x, n)
++
++def bitsws(x, n):
++   if x == (1 << (n - 1)):
++      x = 0
++   if -(1 << (n - 1)) <= x < 0:
++      x += 1 << n
++   return bits(x, n)
++
++def vpm_setup(n, stride, addr, v2 = False): # build a vpm setup word from a count, a stride and a 6-tuple address (see h32/v32/... in arg_defs)
++   horiz, laned, size, y, x, p = addr
++   if size not in (0, 1, 2):
++      raise Exception('addr size should be 0, 1, or 2')
++   if horiz:
++      if x != 0:
++         raise Exception('horizontal accesses must have x of 0')
++   else:
++      if (y & 0xf) != 0:
++         raise Exception('vertical accesses must be 16 row aligned')
++   hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) # horiz, laned and inverted size packed into 4 bits
++   if v2: # v2 layout: bit 29 set, 5-bit n, signed 7-bit stride, 8-bit y
++      return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
++         (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
++   return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
++      (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) # y|x is shifted up by size to make room for the size-bit p
++
++def vdw_setup_0(n, m, addr): # build a vdw setup word (top bits 2) from two 7-bit counts and a 5-tuple dma address (see dma_h32/... in arg_defs)
++   horiz, size, y, x, p = addr
++   if size not in (0, 1, 2):
++      raise Exception('addr size should be 0, 1, or 2')
++   return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
++      (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) # n and m of 128 wrap to 0 via bitsw
++
++def vdr_setup_0(n, m, addr, vpm_stride, stride): # build a vdr setup word (bit 31 set) from counts, a 5-tuple dma address and two strides
++   horiz, size, y, x, p = addr
++   if size not in (0, 1, 2):
++      raise Exception('addr size should be 0, 1, or 2')
++   if (stride < 8) or (stride & (stride - 1)):
++      raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
++   log2_stride = 3
++   while (1 << log2_stride) != stride: # terminates because stride was checked to be a power of 2 >= 8
++      log2_stride += 1
++   return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
++      (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
++      (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) # note the horiz flag is stored inverted here
++
++class allocator_t:
++   def __init__(self, *available):
++      self.available = list(available)
++      self.allocated = {}
++      self.reserved = []
++
++   def copy(self):
++      a = allocator_t()
++      a.available = self.available[:]
++      a.allocated = self.allocated.copy()
++      a.reserved = self.reserved[:]
++      return a
++
++   def forget(self):
++      self.__init__(self.available + self.allocated.values() + self.reserved)
++
++   def reserve(self, *rs):
++      for r in rs:
++         self.available.remove(r)
++         self.reserved.append(r)
++
++   def retire(self, name):
++      r = self.allocated.pop(name)
++      del r.__invert__
++      del r.retire
++      self.available.append(r)
++      return r
++
++   def __getattr__(self, name):
++      if name not in self.allocated:
++         r = self.available.pop()
++         r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
++         r.__invert__ = r.retire
++         self.allocated[name] = r
++      return self.allocated[name]
++
++def pragma_allow_xor_0(x):
++   global allow_xor_0
++
++   if not isinstance(x, bool):
++      raise Exception('allow_xor_0 must be bool')
++   x, allow_xor_0 = allow_xor_0, x
++   return x
++
++def pragma_dont_warn_when_mul_rot_inp_r5(x):
++   global dont_warn_when_mul_rot_inp_r5
++
++   if not isinstance(x, bool):
++      raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
++   x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
++   return x
++
++arg_defs = { # names predefined for operand expressions (arg_eval notes that sets starts out as a copy of this)
++   # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
++   'w':             loc_t(MUX_A,   15, 0, 0, None, RW_EITHER),
++   'z':             loc_t(MUX_B,   15, 0, 0, None, RW_EITHER),
++   'unif':          loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
++   'vary':          loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
++   'tmurs':         loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
++   'r5quad':        loc_t(MUX_A,   37, 0, 0, None, RW_WRITE),
++   'r5rep':         loc_t(MUX_B,   37, 0, 0, None, RW_WRITE),
++   'elem_num':      loc_t(MUX_A,   38, 0, 0, None, RW_READ),
++   'qpu_num':       loc_t(MUX_B,   38, 0, 0, None, RW_READ),
++   'unif_addr':     loc_t(MUX_A,   40, 0, 0, None, RW_WRITE),
++   'unif_addr_rel': loc_t(MUX_B,   40, 0, 0, None, RW_WRITE),
++   'x_coord':       loc_t(MUX_A,   41, 0, 0, None, RW_EITHER),
++   'y_coord':       loc_t(MUX_B,   41, 0, 0, None, RW_EITHER),
++   'ms_mask':       loc_t(MUX_A,   42, 0, 0, None, RW_EITHER),
++   'rev_flag':      loc_t(MUX_B,   42, 0, 0, None, RW_EITHER),
++   'stencil':       loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
++   'tlbz':          loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
++   'tlbm':          loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
++   'tlbc':          loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
++   'vpm':           loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
++   'vr_busy':       loc_t(MUX_A,   49, 0, 0, None, RW_READ), # addresses 49 and 50 mean different things on read and on write
++   'vw_busy':       loc_t(MUX_B,   49, 0, 0, None, RW_READ),
++   'vr_setup':      loc_t(MUX_A,   49, 0, 0, None, RW_WRITE),
++   'vw_setup':      loc_t(MUX_B,   49, 0, 0, None, RW_WRITE),
++   'vr_wait':       loc_t(MUX_A,   50, 0, 0, None, RW_READ),
++   'vw_wait':       loc_t(MUX_B,   50, 0, 0, None, RW_READ),
++   'vr_addr':       loc_t(MUX_A,   50, 0, 0, None, RW_WRITE),
++   'vw_addr':       loc_t(MUX_B,   50, 0, 0, None, RW_WRITE),
++   'mutex':         loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
++   'recip':         loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
++   'recipsqrt':     loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
++   'rsqrt':         loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), # second name for the same location as recipsqrt
++   'exp':           loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
++   'log':           loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
++   't0s':           loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
++   't0t':           loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
++   't0r':           loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
++   't0b':           loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
++   't1s':           loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
++   't1t':           loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
++   't1r':           loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
++   't1b':           loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
++
++   # semaphore acq/rel
++   'sacq': lambda i: sema_t(True, i),
++   'srel': lambda i: sema_t(False, i),
++
++   # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
++   'r_label_maker': label_maker_t(True),
++   'a_label_maker': label_maker_t(False),
++
++   # handy functions
++   'f':     lambda x: struct.unpack('I', struct.pack('f', x))[0], # bit pattern of x as a 32-bit float, returned as an integer
++   'sqrt':  math.sqrt,
++   'sin':   math.sin,
++   'cos':   math.cos,
++   'atan2': math.atan2,
++   'pi':    math.pi,
++   'rseed': random.seed,
++   'rand':  lambda: int(random.getrandbits(32)),
++   'bits':  bits,
++   'bitsw': bitsw,
++   'bitsws': bitsws,
++
++   # handy vpm/vdw/vdr stuff
++   'h32':  lambda y:       (1, 0, 0, y, 0, 0), # vpm address tuples are (horiz, laned, size, y, x, p), as unpacked by vpm_setup
++   'h16l': lambda y, p:    (1, 1, 1, y, 0, p),
++   'h16p': lambda y, p:    (1, 0, 1, y, 0, p),
++   'h8l':  lambda y, p:    (1, 1, 2, y, 0, p),
++   'h8p':  lambda y, p:    (1, 0, 2, y, 0, p),
++   'v32':  lambda y, x:    (0, 0, 0, y, x, 0),
++   'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
++   'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
++   'v8l':  lambda y, x, p: (0, 1, 2, y, x, p),
++   'v8p':  lambda y, x, p: (0, 0, 2, y, x, p),
++   'dma_h32':  lambda y, x:    (1, 0, y, x, 0), # dma address tuples are (horiz, size, y, x, p), as unpacked by vdw_setup_0/vdr_setup_0
++   'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
++   'dma_h8p':  lambda y, x, p: (1, 2, y, x, p),
++   'dma_v32':  lambda y, x:    (0, 0, y, x, 0),
++   'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
++   'dma_v8p':  lambda y, x, p: (0, 2, y, x, p),
++   'vpm_setup': vpm_setup,
++   'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
++   'vdw_setup_0': vdw_setup_0,
++   'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
++   'vdr_setup_0': vdr_setup_0,
++   'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
++   'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
++
++   # annotations
++   'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), # bit mask with the listed positions set
++   'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), # same 'mul_used' key, mask inverted over 16 bits
++   'preserve_cond': ('preserve_cond', 1),
++
++   # somewhat experimental register allocator
++   'allocator_t': allocator_t,
++
++   # pragmas
++   'pragma_allow_xor_0': pragma_allow_xor_0,
++   'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
++
++# accumulators and regs (regular names -- r0..r5, rany0..rany63, ra0..ra63, rb0..rb63), all with no read/write restriction
++arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6))
++arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
++arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
++arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
++
++def arg_eval(arg, sets): # evaluate one operand expression with sets as its namespace and return the resulting value
++   s = (arg.strip().split('.', 1) + [None])[:2] # [text before the first '.', text after it or None]
++   if s[0] == '-': # a bare "-" (optionally with a .pack suffix) is a write to WADDR_NOP
++      return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
++   arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
++   arg = re_pack.sub('._\\1', arg) # .8a -> ._8a so that it parses as attribute access (undone in loc_t.__getattr__)
++   try:
++      # todo: i would like to be able to pass both arg_defs and sets in here
++      # (with sets hiding arg_defs in the case of conflicts), but the obvious
++      # dict(arg_defs, **sets) won't permit things such as:
++      # .set f, lambda x: y
++      # .set y, 4
++      # (the y in the lambda will be looked up in the temporary dict we created
++      # when evaluating the f .set, which doesn't contain y)
++      #
++      # instead, sets is initially set to (a copy of) arg_defs. to simulate the
++      # hiding behaviour, on an unset, we restore any hidden arg_defs value.
++      # also, before dumping sets at the end, we strip out the arg_defs stuff
++      # (this isn't entirely correct as we want to dump sets that are hiding
++      # arg_defs)
++      return eval(arg, sets) # NOTE(review): runs assembler source as python -- only feed this trusted input
++   except Exception, e:
++      asm_error(e)
++   except: # exceptions that do not derive from Exception
++      asm_error('unknown error while evaluating argument')
++
++# doesn't check/fixup pack
++def check_and_fixup_loc(loc, read): # validate loc as a source (read) or destination and adjust it in place
++   if (not read) and (loc.rw == RW_READ):
++      asm_error('writing to read-only hardware register')
++   if read and (loc.rw == RW_WRITE):
++      asm_error('reading from write-only hardware register')
++   if not read:
++      # conceptually, we are writing to a location rotated right by
++      # loc.rot/loc.r5_rot. but we are actually rotating the output right by
++      # -loc.rot/-loc.r5_rot then writing it to the unrotated location
++      loc.rot = -loc.rot % 16
++      loc.r5_rot = -loc.r5_rot % 16
++   if (loc.rot != 0) and (loc.r5_rot != 0):
++      asm_error('can\'t rotate by both r5 and immediate')
++   if (loc.r5_rot != 0) and (loc.r5_rot != 1):
++      asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) # read indexes the tuple: False -> left, True -> right
++   if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
++      if not read:
++         asm_error('target doesn\'t support write rotation')
++      if loc.mux == MUX_ANY:
++         loc.mux = MUX_A # can't do rotated read from regfile b
++      if loc.mux != MUX_A:
++         asm_error('rotation on read only allowed from regfile a')
++      if loc.i >= 32:
++         asm_warning('rotation only works from physical regfile')
++   if loc.mux == MUX_AC: # accumulators: only r0..r5 exist
++      if (loc.i < 0) or (loc.i >= 6):
++         asm_error('reg out of range')
++      if not read:
++         if loc.i == 4:
++            asm_error('not allowed to write to r4')
++         if loc.i == 5:
++
++            asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
++   elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): # regfile addresses are 0..63
++      if (loc.i < 0) or (loc.i >= 64):
++         asm_error('reg out of range')
++   else:
++      assert 0 # unknown mux
++
++def get_dst(dst, sets): # evaluate a destination operand; returns (waddr, wmux, pack triple, rot, r5_rot)
++   if not dst: # no destination given
++      return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0
++   dst = arg_eval(dst, sets)
++   if not isinstance(dst, loc_t):
++      asm_error('invalid dst')
++   dst = dst.copy() # check_and_fixup_loc mutates its argument; keep the shared definition intact
++   check_and_fixup_loc(dst, False)
++   pack = get_pack(dst.pack)
++   if dst.mux == MUX_AC: # accumulators are addressed as WADDR_R0 + index
++      if pack[2] == PACK_MODE_A:
++         asm_warning('ra packing only works when writing to physical regfile')
++         return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
++      return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
++   if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
++      if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
++         asm_warning('ra packing only works when writing to physical regfile')
++      return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
++   if dst.mux == MUX_ANY:
++      return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
++   if dst.mux == MUX_B:
++      if pack[2] == PACK_MODE_A:
++         asm_error('this packing operation can only be used for regfile a')
++      return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
++   assert 0 # unknown mux
++
++def get_src(src, sets): # evaluate a source operand; returns (raddr, rmux, unpack triple, drot, dr5rot)
++   if not src: # no source given
++      return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None
++   src = arg_eval(src, sets)
++   if isinstance(src, sema_t):
++      if not have_sema:
++         asm_error('target does not support semaphores')
++      if (src.i < 0) or (src.i >= 16):
++         asm_error('semaphore number must be in [0, 16)')
++      return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # index in the low 4 bits, acquire flag in bit 4
++   if isinstance(src, label_t):
++      return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
++   if isinstance(src, list): # a python list is a per-element vector immediate
++      if len(src) != 16:
++         asm_error('vector immediate must have length 16')
++      src = src[:] # work on a copy; the elements are masked in place below
++      for i in xrange(16):
++         if not is_int(src[i]):
++            asm_error('all elements of vector immediate must be integers')
++         src[i] &= (1 << 32) - 1
++      return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
++   if is_int(src): # scalar immediate, reduced to 32 bits
++      return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
++   if not isinstance(src, loc_t):
++      asm_error('invalid src')
++   src = src.copy() # check_and_fixup_loc mutates its argument; keep the shared definition intact
++   check_and_fixup_loc(src, True)
++   if mulw_rotate: # the rotation is reported in the last two return values instead of with the read address
++      srot, sr5rot = 0, 0
++      drot, dr5rot = src.rot, src.r5_rot
++   else:
++      srot, sr5rot = src.rot, src.r5_rot
++      drot, dr5rot = 0, 0
++   if src.mux == MUX_AC:
++      if src.i == 4:
++         return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot
++      if src.pack:
++         asm_error('unpack only allowed for regfile a or r4')
++      return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
++   if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
++      return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot # for RMUX_A the address is a tuple (see merge_rmux)
++   if src.mux == MUX_ANY:
++      return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
++   if src.mux == MUX_B:
++      if src.pack:
++         asm_error('unpack only allowed for regfile a or r4')
++      return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
++   assert 0 # unknown mux
++
++# signals
++##########
++
++sigs = { # signal name -> SIG_* constant (names not listed here fall back to SIG_NORMAL in get_sig)
++   'bkpt': SIG_BKPT,
++   'thrsw': SIG_THRSW,
++   'thrend': SIG_THREND,
++   'sbwait': SIG_SBWAIT,
++   'sbdone': SIG_SBDONE,
++   'int': SIG_INT,
++   'loadcv': SIG_LOADCV,
++   'loadc': SIG_LOADC,
++   'ldcend': SIG_LDCEND,
++   'ldtmu0': SIG_LDTMU0,
++   'ldtmu1': SIG_LDTMU1}
++
++def get_sig(sig):
++   if sig not in sigs:
++      return SIG_NORMAL
++   return sigs[sig]
++
++# annotations
++##############
++
++def get_annots(annot, sets):
++   annots = arg_eval(annot, sets)
++   if isinstance(annots, list):
++      annots = annots[:]
++   else:
++      annots = [annots]
++   for i, annot in enumerate(annots):
++      if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
++         (not is_int(annot[1]))):
++         asm_error('annotation must be (string, integer) pair, or a list of such pairs')
++      annots[i] = (annot[0], annot[1] & ((1 << 32) - 1))
++   return annots
++
++###############################################################################
++# core
++###############################################################################
++
++def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): # reconcile the (un)packs of 4 reads and 2 writes; returns (pm, pack, unpack, forcebs, forcerafloat)
++   needfloat = PACK_TYPE_EITHER # type demanded so far by the unpacks seen
++   havefloata = False
++   havefloatr4 = False
++   unpacka = None # the single unpack code used on regfile a, once seen
++   unpackr4 = None # the single unpack code used on r4, once seen
++   forcebs = [False, False, False, False] # per read: must this operand be read through regfile b?
++   forcerafloat = False
++
++   pm = PACK_MODE_EITHER
++   for i in (0, 1, 2, 3): # rpacks entries are the (code, type, location) triples built by get_src
++      if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
++         assert rpacks[i][0] == 0
++      else:
++         if rpacks[i][2] == UNPACK_LOC_A:
++            if unpacka is None:
++               unpacka = rpacks[i][0]
++            elif unpacka != rpacks[i][0]:
++               asm_error('conflicting unpack operations on regfile a')
++            havefloata = havefloata or rfloats[i]
++         elif rpacks[i][2] == UNPACK_LOC_R4:
++            if unpackr4 is None:
++               unpackr4 = rpacks[i][0]
++            elif unpackr4 != rpacks[i][0]:
++               asm_error('conflicting unpack operations on r4')
++            havefloatr4 = havefloatr4 or rfloats[i]
++         else:
++            assert 0
++
++         if rpacks[i][1] != PACK_TYPE_EITHER:
++            if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
++               asm_error('conflicting unpack float requirements')
++            needfloat = rpacks[i][1]
++   for i in (0, 1, 2, 3): # second pass, once the regfile a unpack is known
++      if rpacks[i][2] == UNPACK_LOC_AB:
++         if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
++            forcebs[i] = True # non-nop unpack from regfile a. must use b
++
++   if unpacka: # truthiness test: None and a zero code both count as "no regfile a unpack"
++      if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat:
++         havefloata = True
++         forcerafloat = True
++      havefloat = havefloata
++   else:
++      havefloat = havefloatr4
++
++   if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
++      asm_error('float unpack operation used in integer alu operations')
++   if (needfloat == PACK_TYPE_INT) and havefloat:
++      asm_error('integer unpack operation used in float alu operation')
++
++   unpack = 0
++   if unpacka and unpackr4:
++      asm_error('cannot specify pack operation for both regfile a and r4')
++   if unpacka: # a regfile a unpack fixes the pack mode to A, an r4 unpack fixes it to M
++      pm = PACK_MODE_A
++      unpack = unpacka
++   elif unpackr4:
++      pm = PACK_MODE_M
++      unpack = unpackr4
++
++   pack = 0
++   if wpacks[0][2] == PACK_MODE_M: # wpacks[0] is the add result, per the message below
++      asm_error('mul-unit pack operation used on add result')
++   for i in (0, 1): # wpacks entries are the (code, type, mode) triples built by get_dst
++      if wpacks[i][2] == PACK_MODE_A:
++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
++            asm_error('conflicting pack modes')
++         pm = PACK_MODE_A
++         pack = wpacks[i][0]
++      elif wpacks[i][2] == PACK_MODE_M:
++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
++            asm_error('conflicting pack modes')
++         pm = PACK_MODE_M
++         pack = wpacks[i][0]
++
++      if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
++         asm_error('float pack operation used with integer alu result')
++      if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
++         asm_error('integer pack operation used with float alu result')
++
++   if pm == PACK_MODE_EITHER: # nothing constrained the mode: default to A
++      pm = PACK_MODE_A
++   return pm, pack, unpack, forcebs, forcerafloat
++
++# immediates that can be encoded with SIG_SMALLIMMED
++bimms = {} # 32-bit immediate value -> small immediate code
++bimms.update((i, i) for i in xrange(16)) # 0..15 encode as themselves
++bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) # -16..-1 as 32-bit two's complement
++bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) # float bit patterns, biased exponents 127..134 (1.0 .. 128.0)
++bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) # float bit patterns, biased exponents 119..126 (1/256 .. 1/2)
++
++def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): # fit one more source into the read slots; returns updated (raddr_a, raddr_b, immb, arot_r5) plus the mux chosen for this source
++   if rmux == RMUX_SEMA:
++      asm_error('semaphore op can only be used with mov')
++   if rmux == RMUX_LABEL:
++      asm_error('label not allowed here')
++   if rmux == RMUX_IMMV:
++      asm_error('vector immediate can only be used with mov')
++   if rmux == RMUX_IMM:
++      if raddr not in bimms:
++         asm_error('can\'t encode immediate 0x%08x' % raddr)
++      raddr = bimms[raddr] # the immediate takes over the regfile b address field
++      if not immb:
++         if raddr_b is not None:
++            asm_error('regfile b and immediates don\'t mix')
++         raddr_b = raddr
++         immb = True
++      elif raddr_b != raddr:
++         asm_error('can only encode one rotation/immediate')
++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B
++   if rmux == RMUX_AC:
++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr # accumulators need no read slot
++   if rmux == RMUX_ANY: # either regfile will do: reuse a matching slot first, then take a free one
++      if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr):
++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
++      if (not immb) and (raddr_b == raddr):
++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
++      if raddr_a is None:
++         assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
++         raddr_a = raddr
++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
++      if raddr_b is None:
++         assert not immb
++         raddr_b = raddr
++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
++      asm_error('no free read slots')
++   if rmux == RMUX_A: # here raddr is the (index, rot, r5_rot) tuple built by get_src
++      if (not mulw_rotate) and (raddr_a is not None) and (
++         ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))):
++         asm_error('conflicting rotations from regfile a')
++      if raddr_a is None:
++         raddr_a = raddr[0]
++      elif raddr_a != raddr[0]:
++         asm_error('can only read from one location in each regfile')
++      arot_r5 = raddr[2]
++      if raddr[1] == 0:
++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
++      raddr = 48 + raddr[1] # an immediate rotation is encoded as small immediate 48 + rot
++      if not immb:
++         if raddr_b is not None:
++            asm_error('regfile b and rotation don\'t mix')
++         raddr_b = raddr
++         immb = True
++      elif raddr_b != raddr:
++         asm_error('can only encode one rotation/immediate')
++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A
++   if rmux == RMUX_B:
++      if immb:
++         asm_error('regfile b and rotation/immediates don\'t mix')
++      if raddr_b is None:
++         raddr_b = raddr
++      elif raddr_b != raddr:
++         asm_error('can only read from one location in each regfile')
++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B
++   assert 0 # unknown rmux
++
++# returns True if a mul input selected by rmux is ok to use with a rotated mul write. ok if:
++# - accumulator (r0-r3, ie rmux < 4)
++# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
++#   and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
++#   was written by r5quad. so, by default, r5 (rmux 5) isn't considered uniform
++#   unless dont_warn_when_mul_rot_inp_r5 is set. todo: what about vr_wait/vw_wait/mutex?
++def read_rot_ok(rmux, raddr_a, raddr_b):
++   return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or
++      ((rmux == 6) and (raddr_a in (32, 49))) or # rmux 6 is checked against raddr_a: unif/vr_busy
++      ((rmux == 7) and (raddr_b in (32, 38, 49)))) # rmux 7 is checked against raddr_b: unif/qpu_num/vw_busy
++
++def asm_flush_prog_data(): # moves the bytes queued in prog_data into prog as 'data' entries
++   global prog_data
++
++   while len(prog_data) & 7: # zero-pad the queue to a multiple of 8 bytes
++      prog_data.append(0)
++   for i in xrange(0, len(prog_data), 8): # each 8 bytes become two 32-bit words, lowest byte first
++      prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0),
++         (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {}))
++   prog_data = [] # queue is empty again
++
++def asm_line(sets, location, line): # assembles one source line: handles a directive, or appends an instruction to prog
++   global current_location, construct, nwarn_level
++
++   prev_location = current_location # restored in the finally clause at the end
++   current_location = location
++
++   try:
++      if construct != None: # inside a .macro/.if/.rep: lines are collected rather than assembled
++         if re_macro.match(line):
++            construct_stack.append(CONSTRUCT_MACRO) # nested construct: tracked so that its end is matched up
++         elif re_if.match(line):
++            construct_stack.append(CONSTRUCT_IF)
++         elif re_rep.match(line):
++            construct_stack.append(CONSTRUCT_REP)
++         else:
++            else_m = line == '.else'
++            elif_m = re_elif.match(line)
++            if elif_m:
++               end_construct = CONSTRUCT_IF
++            else:
++               end_construct = {
++                  '.endm':  CONSTRUCT_MACRO,
++                  '.else':  CONSTRUCT_IF,
++                  '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
++                  '.endr':  CONSTRUCT_REP}.get(line)
++            if end_construct is not None:
++               end_construct &= construct_stack.pop() # 0 if this line can't end the innermost construct
++               if end_construct == 0:
++                  if elif_m:
++                     asm_error('unexpected .elif')
++                  asm_error('unexpected %s' % line)
++               if len(construct_stack) == 0: # outermost construct ended: act on the collected lines
++                  lines = construct
++                  construct = None
++                  if end_construct == CONSTRUCT_MACRO:
++                     return # the collected list is the one already stored in macros
++                  if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
++                     condition_if, condition_else = lines[0] # header tuple stored when the construct was opened
++                     lines = lines[1:]
++                     if condition_if:
++                        for location, line in lines:
++                           asm_line(sets, location, line)
++                     if else_m:
++                        construct = [(condition_else, False)] # start collecting the .else body
++                        construct_stack.append(CONSTRUCT_ELSE)
++                     elif elif_m:
++                        if elif_m.group('set'):
++                           condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
++                        else:
++                           condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
++                        condition_else = condition_else and (not condition_if)
++                        construct = [(condition_if, condition_else)]
++                        construct_stack.append(CONSTRUCT_IF)
++                     return
++                  if end_construct == CONSTRUCT_REP:
++                     name, count = lines[0] # header tuple stored when the .rep was opened
++                     lines = lines[1:]
++                     for i in xrange(count):
++                        sets[name] = i
++                        for location, line in lines:
++                           asm_line(sets, location, line)
++                     return
++                  assert 0
++               if else_m:
++                  construct_stack.append(CONSTRUCT_ELSE)
++               elif elif_m:
++                  construct_stack.append(CONSTRUCT_IF)
++         construct.append((current_location, line)) # construct still open: record the line for later
++         return
++
++      if line in ('.endm', '.else', '.endif', '.endr'):
++         asm_error('unexpected %s' % line)
++      if re_elif.match(line):
++         asm_error('unexpected .elif')
++
++      m = re_macro.match(line)
++      if m:
++         construct = []
++         construct_stack.append(CONSTRUCT_MACRO)
++         macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
++         return
++
++      m = re_if.match(line)
++      if m:
++         if m.group('set'):
++            condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
++         else:
++            # not not forces condition to a bool (this matters if condition is
++            # something mutable like a list)
++            condition = not not arg_eval(m.group('condition'), sets)
++         construct = [(condition, not condition)]
++         construct_stack.append(CONSTRUCT_IF)
++         return
++
++      m = re_rep.match(line)
++      if m:
++         count = arg_eval(m.group('count'), sets)
++         if not is_int(count):
++            asm_error('.rep count must be integer')
++         construct = [(m.group('name'), count)]
++         construct_stack.append(CONSTRUCT_REP)
++         return
++
++      m = re_include.match(line)
++      if m:
++         filename = arg_eval(m.group('filename'), sets)
++         if not isinstance(filename, str):
++            asm_error('expected string')
++         asm_file(sets, '%s: %s' % (current_location, filename), filename)
++         return
++
++      m = re_set.match(line)
++      if m:
++         sets[m.group('name')] = arg_eval(m.group('val'), sets)
++         return
++
++      m = re_unset.match(line)
++      if m:
++         name = m.group('name')
++         if name not in sets:
++            asm_error('%s not set' % name)
++         if name in arg_defs: # todo: see arg_eval
++            sets[name] = arg_defs[name]
++         else:
++            del sets[name]
++         return
++
++      m = re_eval.match(line)
++      if m:
++         arg_eval(m.group('expr'), sets)
++         return
++
++      m = re_print_info_warn_error.match(line)
++      if m:
++         def print_fn(message):
++            print message
++         def info_fn(message):
++            sys.stderr.write('%s\n' % message)
++         {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
++            m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
++         return
++
++      m = re_assert.match(line)
++      if m:
++         if not arg_eval(m.group('condition'), sets):
++            asm_error('assertion failure: \'%s\'' % m.group('condition'))
++         return
++
++      m = re_data.match(line)
++      if m:
++         size = int(m.group('size'))
++         for datum in smart_split(m.group('data')):
++            datum = arg_eval(datum, sets)
++            if not is_int(datum):
++               asm_error('datum must be integer')
++            prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
++         return
++
++      m = re_macro_inst.match(line)
++      if m:
++         name = m.group('name')
++         if name in macros:
++            params, lines = macros[name]
++            args = smart_split(m.group('args'))
++            if len(args) > len(params):
++               asm_error('too many arguments to macro')
++            sets = sets.copy() # macro arguments are only visible within this expansion
++            sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
++            for param in params[len(args):]: # params without an argument are unset (or reset to their arg_defs value)
++               if param in sets:
++                  if param in arg_defs: # todo: see arg_eval
++                     sets[param] = arg_defs[param]
++                  else:
++                     del sets[param]
++            for location, line in lines:
++               asm_line(sets, '%s: %s' % (current_location, location), line)
++            return
++
++      if line == '.pushnwarn':
++         nwarn_level += 1
++         return
++      if line == '.popnwarn':
++         if nwarn_level == 0:
++            asm_error('.popnwarn without .pushnwarn')
++         nwarn_level -= 1
++         return
++
++      # everything below assumes prog is up to date
++      asm_flush_prog_data()
++
++      m = re_label.match(line)
++      if m:
++         name = m.group('name')
++         if name[0].isdigit():
++            labels.setdefault(name, []).append(len(prog))
++         else:
++            if name[0] == ':':
++               undecorated_name = name[1:]
++            else:
++               undecorated_name = name
++            if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
++               asm_error('named label defined twice')
++            labels[name] = len(prog)
++         return
++
++      annots = line.split('@')
++      ops = [op.strip() for op in annots[0].split(';')]
++      annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
++      sig = get_sig(ops[-1])
++      if sig != SIG_NORMAL:
++         ops = ops[:-1]
++      if len(ops) > 2:
++         asm_error('too many ops')
++      elif (len(ops) == 1) and (ops[0] == ''):
++         ops = []
++      ops = (ops + ['nop', 'nop'])[:2] # missing ops default to nop. ops[0] is the add op, ops[1] the mul op
++      m = re_op.match(ops[0])
++      if not m:
++         asm_error('invalid syntax')
++      aop, aargs_n = get_aop(m.group('op'))
++      if (aop == AOP_BRA) or (aop == AOP_BRR):
++         acond = get_bcond(m.group('cond'))
++      else:
++         acond = get_cond(m.group('cond'))
++      asf = get_setf(m.group('sf'))
++      aargs = smart_split(m.group('args'))
++      if len(aargs) != aargs_n:
++         asm_error('wrong operand count')
++      ard, ara, arb = (aargs + [None, None, None])[:3]
++      m = re_op.match(ops[1])
++      if not m:
++         asm_error('invalid syntax')
++      mop, margs_n = get_mop(m.group('op'))
++      mcond = get_cond(m.group('cond'))
++      msf = get_setf(m.group('sf'))
++      margs = smart_split(m.group('args'))
++      if len(margs) != margs_n:
++         asm_error('wrong operand count')
++      mrd, mra, mrb = (margs + [None, None, None])[:3]
++      # eval srcs first so allocator can retire and reuse registers for dst
++      aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
++      abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
++      maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
++      mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
++      awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
++      mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
++      if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
++         ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
++         asm_error('cannot have 2 arguments with different rotations')
++      if aarmux is not None:
++         awrot = (awrot + aadrot) % 16
++         awrot_r5 = (awrot_r5 + aadrot_r5) % 16
++      if (awrot != 0) or awrot_r5:
++         asm_error('rotate not allowed on add write')
++      if marmux is not None:
++         mwrot = (mwrot + madrot) % 16
++         mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
++
++      afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
++      afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
++      pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
++         [aarpack, abrpack, marpack, mbrpack],
++         [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
++         aop == AOP_FTOI,
++         [awpack, mwpack],
++         [afloatw, mop == MOP_FMUL])
++      if forcebs[0]:
++         aarmux = RMUX_B
++      if forcebs[1]:
++         abrmux = RMUX_B
++      if forcebs[2]:
++         marmux = RMUX_B
++      if forcebs[3]:
++         mbrmux = RMUX_B
++
++      # extend nops to 3 operands
++      if aop == AOP_NOP:
++         awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
++      if mop == MOP_NOP:
++         mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
++
++      # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
++      if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
++         if forcerafloat:
++            assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
++            # instead of duplicating the 2nd operand, take the ra operand from
++            # the mul op thus forcing the ra value to be considered a float for
++            # the purposes of unpacking
++            if marmux == RMUX_A:
++               abraddr, abrmux = maraddr, marmux
++            else:
++               assert mbrmux == RMUX_A
++               abraddr, abrmux = mbraddr, mbrmux
++         else:
++            abraddr, abrmux = aaraddr, aarmux
++      else:
++         assert not forcerafloat # can only forcerafloat if we have an unused operand
++
++      # handle write addrs
++      if (awmux == mwmux) and (awmux != WMUX_ANY):
++         asm_error('add/mul ops not allowed to write to same regfile')
++      ws = (awmux == WMUX_B) or (mwmux == WMUX_A) # set when the add op writes via WMUX_B or the mul op via WMUX_A
++
++      # handle branch
++      if (aop == AOP_BRA) or (aop == AOP_BRR):
++         # check setf
++         if asf:
++            asm_error('setf not allowed on bra/brr')
++
++         # check pack/unpack
++         if (pack != 0) or (unpack != 0):
++            asm_error('pack/unpack not allowed with bra/brr')
++
++         # handle read address
++         if aarmux == RMUX_LABEL:
++            if (aop == AOP_BRA) and aaraddr[1]:
++               asm_warning('bra with rel label')
++            if (aop == AOP_BRR) and (not aaraddr[1]):
++               asm_warning('brr with abs label')
++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
++         if aarmux == RMUX_ANY:
++            aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
++         if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
++            asm_error('branch destination must be either label, immediate, or from regfile a')
++         if aarmux == RMUX_IMM:
++            imm = aaraddr
++            raddr = 0 # can't use RADDR_NOP
++         elif aarmux == RMUX_A:
++            if (aaraddr[1] != 0) or (aaraddr[2] != 0):
++               asm_error('rotation of read from regfile a not allowed with branch')
++            if aop == AOP_BRR:
++               asm_warning('brr with ra')
++            imm = 0
++            raddr = aaraddr[0]
++         else:
++            assert 0
++
++         # check mul op is nop
++         if mop != MOP_NOP:
++            asm_error('mul op not allowed with branch')
++
++         # check sig
++         if sig != SIG_NORMAL:
++            asm_error('no signal allowed with branch')
++
++         if raddr >= 32:
++            asm_error('can only branch to register locations in physical regfile')
++         if raddr & 1:
++            asm_warning('branch instruction will destroy flags (see hw-2780)')
++
++         # construct branch instruction
++         prog.append((imm,
++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
++            line, annots))
++
++         return
++
++      # use COND_NEVER when possible (might save power / allow mul setf)
++      if not dict(annots).get('preserve_cond', 0):
++          if (awaddr == WADDR_NOP) and (not asf):
++             acond = COND_NEVER
++          if (mwaddr == WADDR_NOP) and (not msf):
++             mcond = COND_NEVER
++
++      # attempt to convert movs to ldi
++      if (# no mul setf
++         (not msf) and
++         # ops must either be nop or mov of sema/label/imm/immv
++         ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
++         ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
++         # but we don't want 2 nops
++         ((aop != AOP_NOP) or (mop != MOP_NOP)) and
++         # if both ops are movs, srcs must be identical
++         ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
++         # no signal
++         (sig == SIG_NORMAL)):
++         # make sure aarmux/aaraddr contains the value
++         if aop != AOP_MOV:
++            aarmux = marmux
++            aaraddr = maraddr
++
++         # convert immediate
++         if aarmux == RMUX_SEMA:
++            ldi_mode = LDI_SEMA
++         elif aarmux == RMUX_LABEL:
++            ldi_mode = LDI_32
++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
++         elif aarmux == RMUX_IMMV:
++            signed, unsigned = True, True
++            imm = 0
++            for i, elem in enumerate(aaraddr):
++               if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
++                  signed = False
++               if elem not in (0, 1, 2, 3):
++                  unsigned = False
++               imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
++            if not (signed or unsigned):
++               asm_error('can\'t encode vector immediate')
++            if signed:
++               ldi_mode = LDI_EL_SIGNED
++            else:
++               ldi_mode = LDI_EL_UNSIGNED
++            aaraddr, aarmux = imm, RMUX_IMM
++         elif aarmux == RMUX_IMM:
++            ldi_mode = LDI_32
++         else:
++            assert 0
++
++         # construct ldi instruction
++         prog.append((aaraddr,
++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
++            line, annots))
++
++         return
++
++      # convert movs to alu ops
++      if aop == AOP_MOV:
++         if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
++            aop = AOP_XOR
++            aaraddr, aarmux = 0, RMUX_AC
++            abraddr, abrmux = 0, RMUX_AC
++         else:
++            aop = AOP_OR
++            abraddr, abrmux = aaraddr, aarmux
++      if mop == MOP_MOV:
++         if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
++            mop = MOP_V8SUBS
++            maraddr, marmux = 0, RMUX_AC
++            mbraddr, mbrmux = 0, RMUX_AC
++         else:
++            mop = MOP_V8MIN
++            mbraddr, mbrmux = maraddr, marmux
++
++      # normal alu instruction...
++
++      # handle setf
++      if asf and (aop == AOP_NOP):
++         asm_error('nop.setf is not allowed in add pipe')
++      if msf and (mop == MOP_NOP):
++         asm_warning('nop.setf, really?')
++      if (aop == AOP_NOP) or (acond == COND_NEVER):
++         sf = msf
++      else:
++         if msf:
++            asm_error('setf only allowed on mul op if add op is nop or add condition is never')
++         sf = asf
++
++      # handle read addrs
++      raddr_a = None # None means this read slot hasn't been claimed yet
++      raddr_b = None
++      immb = False
++      arot_r5 = False
++      muxes = [0, 0, 0, 0]
++      if mwrot != 0:
++         raddr_b = 48 + mwrot
++         immb = True
++      if mwrot_r5 and have_am:
++         raddr_b = 48
++         immb = True
++      for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last
++         for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
++            if f(rmux):
++               raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
++      add_a, add_b, mul_a, mul_b = muxes
++      if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
++         # some output elements might not be as expected
++         if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
++            bad_elems = 0xffff
++         else:
++            bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
++            if mwrot > 12:
++               bad_elems ^= 0xffff
++         bad_elems &= dict(annots).get('mul_used', 0xffff)
++         if not msf:
++            if mwaddr == WADDR_NOP:
++               # not writing anywhere and not setting flags. no elements used
++               bad_elems = 0
++            elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
++               ((not ws) and (mwaddr == 37))):
++               # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
++               # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
++               # only use element 0
++               bad_elems &= 0x0001
++            elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
++               ((not ws) and (mwaddr == 42))):
++               # writing to r5quad/x_coord/y_coord/rev_flag and not setting
++               # flags. only use elements 0, 4, 8, and 12
++               bad_elems &= 0x1111
++         if bad_elems:
++            asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
++      if raddr_a is None: # unclaimed read slots are encoded as RADDR_NOP
++         raddr_a = RADDR_NOP
++      if raddr_b is None:
++         raddr_b = RADDR_NOP
++      if immb:
++         if sig != SIG_NORMAL:
++            asm_error('rotation/immediates and signal don\'t mix')
++         sig = SIG_SMALLIMMED
++      if arot_r5 or (mwrot_r5 and (not have_am)):
++         if sig != SIG_NORMAL:
++            asm_error('rotation/immediates/signal don\'t mix')
++         sig = SIG_ROTATE
++
++      # construct instruction
++      prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
++         (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
++         line, annots))
++   finally:
++      current_location = prev_location
++
++def preprocess_passthrough(file): # default preprocessor for asm_file: yields (line number, line) with lines unchanged
++   line_number = 0
++   for line in file:
++      line_number += 1 # so numbering starts at 1
++      yield line_number, line
++
++def asm_file(sets, location, filename, preprocess = None): # assembles a file line by line. filename None means read stdin
++   global current_dir, current_location
++
++   if filename is None:
++      location = '<stdin>'
++      file = sys.stdin
++
++      prev_dir = current_dir # current_dir isn't changed in this case
++   else:
++      filename = os.path.normpath(os.path.join(current_dir, filename)) # relative names are resolved against current_dir
++
++      try:
++         file = open(filename) # note: never explicitly closed
++      except Exception, e:
++         asm_error(e)
++      except:
++         asm_error('unknown error while opening file %s' % filename)
++
++      prev_dir = current_dir
++      current_dir = os.path.dirname(filename) # so files opened while assembling this one resolve relative to it
++
++   prev_location = current_location
++   current_location = location
++
++   if preprocess is None:
++      preprocess = preprocess_passthrough # default: lines are passed through unchanged
++
++   try:
++      for line_number, line in preprocess(file):
++         # strip off comments and whitespace
++         line = line.split('#')[0].strip()
++         if line == '':
++            continue
++
++         asm_line(sets, '%s: %d' % (current_location, line_number), line)
++   finally: # always restore the caller's dir/location, even if assembling raised
++      current_dir = prev_dir
++      current_location = prev_location
++
++def asm_end_prog(): # finishes assembly: checks directive state, flushes data, resolves label references in prog
++   # check we aren't in a multi-line construct (eg .macro or .rep)
++   if construct != None:
++      asm_error({
++         CONSTRUCT_MACRO: '.macro without .endm',
++         CONSTRUCT_IF:    '.if/.elif without .endif',
++         CONSTRUCT_ELSE:  '.else without .endif',
++         CONSTRUCT_REP:   '.rep without .endr'}[construct_stack[-1]])
++
++   # check no warnings level back to 0
++   if nwarn_level != 0:
++      asm_error('.pushnwarn without .popnwarn')
++
++   # flush queued up data
++   asm_flush_prog_data()
++
++   # fixup all the label references we can
++   for pc in xrange(len(prog)):
++      if isinstance(prog[pc][0], tuple): # first word is still an unresolved (location, label, rel, offset) reference
++         location, label, rel, offset = prog[pc][0]
++         if label[0].isdigit(): # numeric label: last char gives the search direction ('b' = at or before pc, else after pc)
++            label_pcs = labels.get(label[:-1], [])
++            if label[-1] == 'b':
++               label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:]
++            else:
++               label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1]
++            if label_pcs == []:
++               asm_error('search for label reached begin/end of file', location = location)
++            imm = label_pcs[0]
++         elif label in labels:
++            imm = labels[label]
++         elif (':' + label) in labels:
++            imm = labels[':' + label]
++         elif external_link:
++            continue # let the external linker deal with it
++         else:
++            asm_error('undefined label', location = location)
++         imm = (imm * 8) + offset # prog index -> byte offset (8 bytes per prog entry)
++         if rel:
++            imm -= (pc + 4) * 8 # relative to instruction after delay slots
++            imm &= (1 << 32) - 1
++         else:
++            if not external_link:
++               asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
++            imm = (location, label, rel, offset, imm) # 5-tuple: offset within prog is known, final address is left to the dumper
++         prog[pc] = (imm,) + prog[pc][1:]
++
++def asm_init(): # (re)initialises all assembler globals, including the macro table
++   global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
++
++   current_dir = os.getcwd()
++   current_location = ''
++   prog = []
++   prog_data = []
++   macros = { # predefined macros, stored as name -> (params, [(location, line), ...])
++      'sacq': (['dst', 'i'], [('candyland', 'mov  dst, sacq(i)')]),
++      'srel': (['dst', 'i'], [('candyland', 'mov  dst, srel(i)')])}
++   labels = {}
++   construct = None # None means not inside a multi-line construct
++   construct_stack = []
++   nwarn_level = 0
++
++def asm_reset_prog(): # clears only the emitted program and its labels. other assembler state is left alone
++   global prog, labels
++
++   prog = []
++   labels = {}
++
++###############################################################################
++# dumping
++###############################################################################
++
++def print_lines(lines): # prints each element of lines on its own line of stdout
++   for line in lines:
++      print line
++
++class dumper_t: # base class for the output dumpers. every hook defaults to doing nothing
++   def external_link(self): return False # subclasses return True here; presumably feeds the external_link flag -- confirm at call site
++   def begin(self): pass
++   def label(self, pc, name): pass
++   def line(self, pc, ls, ms, line, annots, first): pass # subclasses print ls/ms as the two 32-bit words of entry pc
++   def end(self): pass
++   def sets(self, sets): pass
++   def direct(self, line): pass
++
++class clif_dumper_t(dumper_t): # dumper whose output depends on annot_mode (0, 1 or 2. see parse_annot_mode)
++   def __init__(self):
++      self.annot_mode = 0
++
++   def external_link(self):
++      return True
++
++   def parse_annot_mode(self, line): # line is either 'mode' or, for mode 2 only, 'mode, buffer name'
++      l = line.split(',')
++      self.annot_mode = int(l[0])
++      if self.annot_mode not in (0, 1, 2):
++         asm_error('bad annot mode')
++      if self.annot_mode == 2:
++         if len(l) != 2:
++            asm_error('expected buffer name')
++         self.annot_name = l[1].strip()
++         self.annot_offset = 0 # running offset used by line() in mode 2
++      elif len(l) != 1:
++         asm_error('unexpected comma')
++
++   def label(self, pc, name):
++      if (self.annot_mode != 1) and (name[0] == ':'): # ':'-prefixed labels become @label lines, except in mode 1
++         if self.annot_mode == 2:
++            name = name + '_annotations'
++         print '@label %s' % name[1:]
++      else:
++         print '// :%s' % name # everything else is only emitted as a comment
++
++   def line(self, pc, ls, ms, line, annots, first):
++      if self.annot_mode == 0: # mode 0: print the two instruction words
++         if isinstance(ls, tuple): # label reference that asm_end_prog left unresolved
++            if len(ls) == 5:
++               location, label, rel, offset, offset_from_prog = ls
++               assert not rel
++               ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
++            else:
++               location, label, rel, offset = ls
++               if rel:
++                  asm_error('relative external label references not allowed in this mode', location = location)
++               ls = '[%s + %d]' % (label, offset)
++         else:
++            ls = '0x%08x' % ls
++         print '%s 0x%08x // %s' % (ls, ms, line)
++      elif self.annot_mode == 1: # mode 1: print the annotation records of each line instead of the instruction
++         print '// %s' % line
++         for annot in annots:
++            print '0x%08x 0x%08x // %s' % ({
++               # todo: would rather not have these hard coded
++               'mul_used':              1,
++               'preserve_cond':         2,
++               'geomd_open':            3,
++               'geomd_i':               4,
++               'geomd_tris_clear':      5,
++               'geomd_verts':           6,
++               'geomd_tris_add':        7,
++               'geomd_tris_set_center': 8,
++               'geomd_region_clear':    9,
++               'geomd_region_set':      10,
++               'geomd_images_clear':    11,
++               'geomd_images_l':        12,
++               'geomd_images_b':        13,
++               'geomd_images_r':        14,
++               'geomd_images_t':        15,
++               'geomd_images_add_vpm':  16,
++               'trace_4c':              17,
++               'geomd_images_add_tex':  18,}[annot[0]], annot[1], annot[0])
++         if len(annots) != 0:
++            print '0x00000000 // end'
++      else: # mode 2: print one word per line: 0 if no annots, else a reference into the annot_name buffer
++         assert self.annot_mode == 2
++         if len(annots) == 0:
++            print '0x00000000 // %s' % line
++         else:
++            print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
++            self.annot_offset += (len(annots) * 8) + 4 # presumably matches mode 1 output: 8 bytes per annot plus the 4 byte end word
++
++   def direct(self, line):
++      print line
++
++class plain_dumper_t(dumper_t): # prints each prog entry as two comma-separated hex words followed by the source line
++   def line(self, pc, ls, ms, line, annots, first):
++      print '0x%08x, 0x%08x, // %s' % (ls, ms, line) # ls must be an integer here (no handling of label reference tuples)
++
++class c_c_dumper_t(dumper_t): # prints a c source file defining the program as an unsigned int array called array_name
++   def __init__(self, header_name, full_header_name, array_name): # full_header_name isn't used by this dumper
++      self.header_name = header_name
++      self.array_name = array_name
++
++   def external_link(self):
++      return True
++
++   def begin(self):
++      self.external_labels = set() # labels for which an extern declaration has already been printed
++      self.lines = [] # array body. buffered so that extern declarations can be printed first
++
++      print '#include "%s.h"' % self.header_name
++      print ''
++      print '#ifdef _MSC_VER'
++      print '   #include <stdint.h>'
++      print '   /* cast through uintptr_t to avoid warnings */'
++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
++      print '#else'
++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(X))'
++      print '#endif'
++      print ''
++      print '#ifdef __cplusplus'
++      print 'extern "C" { /* the types are probably wrong... */'
++      print '#endif'
++
++   def label(self, pc, name):
++      self.lines.append('// :%s' % name)
++
++   def line(self, pc, ls, ms, line, annots, first):
++      if isinstance(ls, tuple): # label reference that asm_end_prog left unresolved
++         if len(ls) == 5: # absolute reference to a location inside this program
++            location, label, rel, offset, offset_from_prog = ls
++            assert not rel
++            ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
++         else: # reference to a label that isn't defined in this program
++            location, label, rel, offset = ls
++            if rel:
++               asm_error('relative external label references not allowed in this mode', location = location)
++            if label not in self.external_labels:
++               self.external_labels.add(label)
++               print 'extern uint8_t %s[];' % label # printed immediately, so it lands inside the extern "C" block
++            ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
++      else:
++         ls = '0x%08x' % ls
++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
++
++   def end(self):
++      print '#ifdef __cplusplus'
++      print '}'
++      print '#endif'
++      print ''
++      print '#ifdef _MSC_VER'
++      print '__declspec(align(8))'
++      print '#elif defined(__GNUC__)'
++      print '__attribute__((aligned(8)))'
++      print '#endif'
++      print 'unsigned int %s[] = {' % self.array_name
++      print_lines(self.lines) # the buffered label comments and instruction rows
++      print '};'
++      print '#ifdef __HIGHC__'
++      print '#pragma Align_to(8, %s)' % self.array_name
++      print '#endif'
++
++class c_h_dumper_t(dumper_t): # prints a c header declaring the program array and a #define per ':'-prefixed label
++   def __init__(self, header_name, full_header_name, array_name): # header_name isn't used by this dumper
++      self.full_header_name = full_header_name
++      self.array_name = array_name
++
++   def external_link(self):
++      return True
++
++   def begin(self):
++      print '#ifndef %s_H' % self.full_header_name # include guard, closed in end()
++      print '#define %s_H' % self.full_header_name
++      print ''
++      print 'extern unsigned int %s[];' % self.array_name
++      print ''
++
++   def label(self, pc, name):
++      if name[0] == ':': # labels without a leading ':' produce no output
++         print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) # pc * 2: two unsigned ints per prog entry
++
++   def end(self):
++      print ''
++      print '#endif'
++
++class ml_c_dumper_t(dumper_t):
++   def __init__(self, header_name, full_header_name, name, annots):
++      self.header_name = header_name
++      self.name = name
++      self.annots = annots
++
++   def external_link(self):
++      return True
++
++   def begin(self):
++      if self.annots:
++         self.annot_lines = []
++      self.lines = []
++      self.external_labels = set()
++      self.link_lines = []
++
++      print '#include "%s.h"' % self.header_name
++      print '#include <assert.h>'
++      if self.annots:
++         print '#ifdef SIMPENROSE'
++         print '#include <stddef.h>'
++         print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
++      print ''
++
++   def label(self, pc, name):
++      self.lines.append('// :%s' % name)
++
++   def line(self, pc, ls, ms, line, annots, first):
++      if self.annots:
++         if len(annots) == 0:
++            self.annot_lines.append('NULL,')
++         else:
++            print 'static unsigned int const annotations_%d[] = {' % pc
++            for annot in annots:
++               print '   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
++            print '   SIMPENROSE_SHADER_ANNOTATION_END};'
++            print ''
++            self.annot_lines.append('annotations_%d,' % pc)
++      if isinstance(ls, tuple):
++         self.link_lines.append('   assert(p[%d] == 0xdeadbeef);' % (pc * 2))
++         if len(ls) == 5:
++            location, label, rel, offset, offset_from_prog = ls
++            assert not rel
++            self.link_lines.append('   p[%d] = base + %d;' % (pc * 2, offset_from_prog))
++         else:
++            location, label, rel, offset = ls
++            self.external_labels.add(label)
++            if rel:
++               self.link_lines.append('   p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
++            else:
++               self.link_lines.append('   p[%d] = %s + %d;' % (pc * 2, label, offset))
++         ls = '0xdeadbeef'
++      else:
++         ls = '0x%08x' % ls
++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
++
++   def end(self):
++      if self.annots:
++         print 'unsigned int const *const %s_annotations_array[] = {' % self.name
++         print_lines(self.annot_lines)
++         print '};'
++         print '#endif'
++         print ''
++      print 'static unsigned int const array[] = {'
++      print_lines(self.lines)
++      print '};'
++      print ''
++      print 'void %s_link(void *p_in, unsigned int base' % self.name
++      for label in sorted(self.external_labels):
++         print '   , unsigned int %s' % label
++      print '   )'
++      print '{'
++      print '   unsigned int *p = (unsigned int *)p_in;'
++      print '   unsigned int i;'
++      print '   for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper()
++      print '      p[i] = array[i];'
++      print '   }'
++      print_lines(self.link_lines)
++      print '}'
++
++class ml_h_dumper_t(dumper_t):
++   def __init__(self, header_name, full_header_name, name, annots):
++      self.full_header_name = full_header_name
++      self.name = name
++      self.annots = annots
++
++   def external_link(self):
++      return True
++
++   def begin(self):
++      self.external_labels = set()
++      self.lines_n = 0
++
++      print '#ifndef %s_H' % self.full_header_name
++      print '#define %s_H' % self.full_header_name
++      print ''
++      if self.annots:
++         print '#ifdef SIMPENROSE'
++         print '   extern unsigned int const *const %s_annotations_array[];' % self.name
++         print '#endif'
++         print ''
++
++   def label(self, pc, name):
++      if name[0] == ':':
++         print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8)
++         if self.annots:
++            print '#ifdef SIMPENROSE'
++            print '   #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc)
++            print '#endif'
++
++   def line(self, pc, ls, ms, line, annots, first):
++      if isinstance(ls, tuple) and (len(ls) != 5):
++         self.external_labels.add(ls[1])
++      self.lines_n += 1
++
++   def end(self):
++      print ''
++      print 'extern void %s_link(void *p, unsigned int base' % self.name
++      for label in sorted(self.external_labels):
++         print '   , unsigned int %s' % label
++      print '   );'
++      print ''
++      print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8))
++      print ''
++      print '#endif'
++
++def print_lines_lc(lines):
++   for line in lines:
++      print '%s \\' % line
++
++def print_groups_lc(groups):
++   first = True
++   for group in groups:
++      if first:
++         print '{ \\'
++      else:
++         print ', { \\'
++      print_lines_lc(group)
++      print '} \\'
++      first = False
++
++class inline_c_dumper_t(dumper_t):
++   def __init__(self, annots):
++      self.annots = annots
++      self.iteration = False
++
++   def begin_iteration(self):
++      assert not self.iteration
++      self.iteration = True
++      self.iteration_lines = []
++      if self.annots:
++         self.iteration_annot_lines = []
++         self.annot_arrs = []
++
++   def end_iteration(self):
++      assert self.iteration
++      self.iteration = False
++      print '%d, \\' % self.iteration_n
++      if self.annots:
++         print '( \\'
++      print_groups_lc(self.iteration_lines)
++      if self.annots:
++         print '), ( \\'
++         print_groups_lc(self.iteration_annot_lines)
++         print '), ( \\'
++         for annot_arr in self.annot_arrs:
++            print_lines_lc(annot_arr)
++         print ') \\'
++
++   def begin(self):
++      self.n = 0
++      self.lines = []
++      if self.annots:
++         self.annot_lines = []
++         if not self.iteration:
++            self.annot_arrs = []
++
++   def label(self, pc, name):
++      self.lines.append('/* :%s */' % name)
++      if self.annots:
++         self.annot_lines.append('/* :%s */' % name)
++
++   def line(self, pc, ls, ms, line, annots, first):
++      self.n += 1
++      if first:
++         prefix = ''
++      else:
++         prefix = ', '
++      self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line))
++      if self.annots:
++         if len(annots) == 0:
++            a = 'NULL'
++         else:
++            a = 'annotations_%d' % len(self.annot_arrs)
++            annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
++            for annot in annots:
++               annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
++            annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_END};')
++            self.annot_arrs.append(annot_arr)
++         self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
++
++   def end(self):
++      if self.iteration:
++         if len(self.iteration_lines) == 0:
++            self.iteration_n = self.n
++         elif self.iteration_n != self.n:
++            asm_error('number of instructions differs between iterations')
++         self.iteration_lines.append(self.lines)
++         if self.annots:
++            self.iteration_annot_lines.append(self.annot_lines)
++      else:
++         if self.annots:
++            print '( \\'
++         print_lines_lc(self.lines)
++         if self.annots:
++            print '), ( \\'
++            print_lines_lc(self.annot_lines)
++            print '), ( \\'
++            for annot_arr in self.annot_arrs:
++               print_lines_lc(annot_arr)
++            print ') \\'
++
++   def direct(self, line):
++      print line
++
++class asvc_dumper_t(dumper_t):
++   def external_link(self):
++      return True
++
++   def begin(self):
++      print '.align 8'
++
++   def label(self, pc, name):
++      if name[0] == ':':
++         print '%s::' % name[1:]
++      else:
++         print '%s:' % name
++
++   def line(self, pc, ls, ms, line, annots, first):
++      if isinstance(ls, tuple):
++         location, label, rel, offset = ls[:4]
++         if rel:
++            ls = '%s + %d - (. + 32)' % (label, offset)
++         else:
++            ls = '%s + %d' % (label, offset)
++      else:
++         ls = '0x%08x' % ls
++      print '.word %s, 0x%08x ; %s' % (ls, ms, line)
++
++def is_ra_or_rb(val):
++   return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
++
++class aliases_dumper_t(dumper_t):
++   def external_link(self):
++      return True
++
++   def begin(self):
++      print '#ifndef JUST_DQASM_ARGS'
++
++   def label(self, pc, name):
++      if not name[0].isdigit():
++         if name[0] == ':':
++            name = name[1:]
++         print '"bs%s", "bs%x",' % (name, pc * 8)
++         print '"bu%s", "bu%x",' % (name, pc * 8)
++
++   def end(self):
++      print '#endif'
++
++   # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
++   def sets(self, sets):
++      dqasm_args = []
++      print '#ifndef JUST_DQASM_ARGS'
++      for name in sets:
++         if is_ra_or_rb(sets[name]):
++            dqasm_args.append('-r%s=%s' % (sets[name], name))
++            print '"%s", "%s",' % (name, sets[name])
++         elif isinstance(sets[name], list):
++            for i, val in enumerate(sets[name]):
++               if is_ra_or_rb(val):
++                  dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
++                  print '"%s[%d]", "%s",' % (name, i, val)
++      print '#endif'
++      print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args)
++
++def dump(dumper):
++   if (len(prog) != 0) or (len(labels) != 0):
++      dumper.begin()
++
++      sorted_labels = []
++      for name in labels:
++         if name[0].isdigit():
++            for pc in labels[name]:
++               sorted_labels.append((pc, name))
++         else:
++            sorted_labels.append((labels[name], name))
++      sorted_labels.sort(reverse = True)
++
++      first = True
++      for pc in xrange(len(prog)):
++         ls, ms, line, annots = prog[pc]
++         while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
++            dumper.label(*sorted_labels.pop())
++         dumper.line(pc, ls, ms, line, annots, first)
++         first = False
++      for sorted_label in sorted_labels:
++         assert sorted_label[0] == len(prog)
++         dumper.label(*sorted_label)
++
++      dumper.end()
++
++###############################################################################
++# preprocessing
++###############################################################################
++
++def preprocess_inline_c(dumper):
++   def preprocess(file):
++      ls = None
++      line_number = 0
++      for line in file:
++         line_number += 1
++         while True:
++            if ls is None:
++               l = line.split('%[', 1)
++               if len(l) == 1:
++                  dumper.direct(l[0].rstrip())
++                  break
++               dumper.direct('%s \\' % l[0].rstrip())
++               line = l[1]
++               ls = []
++            else:
++               l = line.split('%]', 1)
++               ls.append((line_number, l[0]))
++               if len(l) == 1:
++                  break
++               line = l[1]
++               l = ls[-1][1].split('%|', 1)
++               if len(l) == 1:
++                  for l_number, l in ls:
++                     yield l_number, l
++                  asm_end_prog()
++                  dump(dumper)
++                  asm_reset_prog()
++               else:
++                  ls[-1] = (ls[-1][0], l[0])
++                  if hasattr(dumper, 'begin_iteration'):
++                     dumper.begin_iteration()
++                  for repls in l[1].split('%,'):
++                     repls = [repl.strip() for repl in repls.split('%/')]
++                     for l_number, l in ls:
++                        for i, repl in enumerate(repls):
++                           l = l.replace('%' + str(i), repl)
++                        yield l_number, l
++                     asm_end_prog()
++                     dump(dumper)
++                     asm_reset_prog()
++                  if hasattr(dumper, 'end_iteration'):
++                     dumper.end_iteration()
++               ls = None
++   return preprocess
++
++def preprocess_clif(dumper):
++   def preprocess(file):
++      in_asm = False
++      line_number = 0
++      for line in file:
++         line_number += 1
++         if in_asm:
++            if line.strip() == '%]':
++               asm_end_prog()
++               dump(dumper)
++               asm_reset_prog()
++               in_asm = False
++            else:
++               yield line_number, line
++         else:
++            if line.strip() == '%[':
++               in_asm = True
++            elif (line[:1] == '%') and (line[:2] != '%@'):
++               yield line_number, line[1:]
++            else:
++               asm_end_prog()
++               dump(dumper)
++               asm_reset_prog()
++               if line[:2] == '%@':
++                  if hasattr(dumper, 'parse_annot_mode'):
++                     dumper.parse_annot_mode(line[2:])
++               else:
++                  dumper.direct(line.rstrip())
++   return preprocess
++
++###############################################################################
++# main
++###############################################################################
++
++def main():
++   global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
++   global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
++
++   asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
++
++   # parse command line
++   parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
++   parser.add_option('-m', '--mode', dest = 'mode',
++      help = '<mode> should be clif, plain, ' +
++      'c_c:<header_name>,<full_header_name>,<array_name>, ' +
++      'c_h:<header_name>,<full_header_name>,<array_name>, ' +
++      'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
++      'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
++      'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
++   parser.add_option('-t', '--target', dest = 'target',
++      help = '<target> should be a0, b0, or hera', metavar = '<target>')
++   parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
++   parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
++   parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
++   parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
++   parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
++   options, args = parser.parse_args()
++   if len(args) == 0:
++      filename = None
++   elif len(args) == 1:
++      filename = args[0]
++   else:
++      parser.print_help()
++      sys.exit(-1)
++
++   # handle mode
++   mode = options.mode or 'clif' # assume clif if no mode specified
++   if mode == 'clif':
++      dumper = clif_dumper_t()
++      preprocess = preprocess_clif(dumper)
++   elif mode == 'plain':
++      dumper = plain_dumper_t()
++      preprocess = None
++   elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
++      mode_options = mode[4:].split(',')
++      if len(mode_options) != 3:
++         asm_error('badly formatted mode on command line')
++      dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
++      preprocess = None
++   elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
++      mode_options = mode[5:].split(',')
++      if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
++         asm_error('badly formatted mode on command line')
++      dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
++         }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
++      preprocess = None
++   elif mode == 'inline_c':
++      dumper = inline_c_dumper_t(False)
++      preprocess = preprocess_inline_c(dumper)
++   elif mode == 'inline_c:annots':
++      dumper = inline_c_dumper_t(True)
++      preprocess = preprocess_inline_c(dumper)
++   elif mode == 'asvc':
++      dumper = asvc_dumper_t()
++      preprocess = None
++   elif mode == 'aliases':
++      dumper = aliases_dumper_t()
++      preprocess = None
++   elif mode == 'aliases:inline_c':
++      dumper = aliases_dumper_t()
++      preprocess = preprocess_inline_c(dumper)
++   else:
++      asm_error('invalid mode')
++   external_link = dumper.external_link()
++
++   # handle target
++   target = options.target or 'b0' # assume b0 if no target specified
++   if target == 'a0':
++      have_sema = False
++      have_am = False
++      mulw_rotate = False
++      have_lthrsw = False
++   elif target == 'b0':
++      have_sema = True
++      have_am = True
++      mulw_rotate = True
++      have_lthrsw = True
++   elif target == 'hera':
++      have_sema = True
++      have_am = False
++      mulw_rotate = True
++      have_lthrsw = True
++   else:
++      asm_error('invalid target')
++   if have_am:
++      sigs['loadam'] = SIG_LOADAM
++      arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
++   if have_lthrsw:
++      sigs['lthrsw'] = SIG_LTHRSW
++      del sigs['int']
++      arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
++
++   # handle misc options
++   allow_xor_0 = options.allow_xor_0
++   dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
++   warnings_are_errors = options.warnings_are_errors
++   disable_warnings = options.disable_warnings
++
++   # make options visible to asm
++   arg_defs['mode'] = mode
++   arg_defs['target'] = target
++
++   # arg_defs all setup at this point
++   sets = arg_defs.copy() # todo: see arg_eval
++
++   # handle command line sets
++   re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
++   for options_set in options.sets:
++      m = re_options_set.match(options_set)
++      if not m:
++         asm_error('badly formatted set on command line')
++      sets[m.group('name')] = arg_eval(m.group('val'), sets)
++
++   # assemble input file and dump
++   asm_file(sets, filename, filename, preprocess)
++   asm_end_prog()
++   dump(dumper)
++   for name in arg_defs: # todo: see arg_eval
++      del sets[name]
++   dumper.sets(sets)
++
++if __name__ == '__main__':
++   main()
+diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+new file mode 100755
+index 0000000..6a9a33f
+--- /dev/null
++++ b/pi-util/rebase_liblinks.py
+@@ -0,0 +1,37 @@
++#!/usr/bin/env python
++
++import os, sys
++from stat import *
++
++def walktree(top, callback, n, prefix):
++    '''recursively descend the directory tree rooted at top,
++       calling the callback function for each symbolic link'''
++
++    for f in os.listdir(top):
++        pathname = os.path.join(top, f)
++        mode = os.lstat(pathname).st_mode
++        if S_ISDIR(mode):
++            # It's a directory, recurse into it
++            walktree(pathname, callback, n+1, prefix)
++        elif S_ISLNK(mode):
++            # It's a symlink, call the callback function with its target
++            callback(pathname, os.readlink(pathname), n, prefix)
++
++def visitfile(file, linkname, n, prefix):
++    if (linkname.startswith(prefix + 'lib/')):
++        newlink = "../" * n + linkname[len(prefix):]
++        print 'relinking', file, "->", newlink
++        os.remove(file)
++        os.symlink(newlink, file)
++
++if __name__ == '__main__':
++    argc = len(sys.argv)
++    if argc == 2:
++        walktree(sys.argv[1], visitfile, 0, "/")
++    elif argc == 3:
++        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
++    else:
++        print "rebase_liblinks.py <local root> [<old sysroot>]"
++
++
++
+diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
+new file mode 100755
+index 0000000..d8bdd91
+--- /dev/null
++++ b/pi-util/syncroot.sh
+@@ -0,0 +1,43 @@
++set -e
++
++if [ "$1" == "" ]; then
++  echo Usage: $0 \<src_dir\> [\<rootname\>]
++  echo src_dir is a source for rsync so may contain m/c name.
++  echo rootname will be set to \"raspian_jessie_pi1\" if missing
++  echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
++  exit 1
++fi
++
++SYSROOT_NAME=$2
++if [ "$SYSROOT_NAME" == "" ]; then
++  SYSROOT_NAME=raspian_jessie_pi1
++fi
++
++DST_ROOT=`pwd`
++DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
++SRC=$1
++
++echo Sync src:  $SRC
++echo Sync dest: $DST
++
++mkdir -p $DST/lib
++mkdir -p $DST/opt/vc/include
++mkdir -p $DST/usr/lib/pkgconfig
++mkdir -p $DST/usr/bin
++mkdir -p $DST/usr/share
++
++#### MUST NOT include /opt/vc/include/*GL*
++# Creates conflicts with GL includes inside Chrome
++
++rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib
++rsync -rl $SRC/opt/vc/lib $DST/opt/vc
++rsync -l  $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include
++rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include
++rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include
++rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib
++rsync -rl $SRC/usr/lib/gcc $DST/usr/lib
++rsync -rl $SRC/usr/include $DST/usr
++
++pi-util/rebase_liblinks.py $DST
++
++
 
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-0001-Squashed-commit-of-the-following.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-0001-Squashed-commit-of-the-following.patch
deleted file mode 100644
index fee44ddbc6..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-0001-Squashed-commit-of-the-following.patch
+++ /dev/null
@@ -1,2180 +0,0 @@
-From d08594462136274636c1f2f476a6410ff92a9e16 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 13 Jan 2016 16:13:33 +0000
-Subject: [PATCH] H.265 residual decode rework (v2)
-
-Rework the cabac decode functions
-Simplify the code flow and variable usage where possible
-
-(Remove profiling and other spurious deltas that were in v1)
----
- libavcodec/arm/cabac.h                |  155 ++++-
- libavcodec/arm/hevc_cabac.h           |  491 +++++++++++++++
- libavcodec/arm/hevcdsp_deblock_neon.S |   13 +-
- libavcodec/arm/hevcdsp_epel_neon.S    |    9 +-
- libavcodec/cabac.h                    |    9 +-
- libavcodec/hevc_cabac.c               | 1096 +++++++++++++++++++++++++--------
- 6 files changed, 1509 insertions(+), 264 deletions(-)
- create mode 100644 libavcodec/arm/hevc_cabac.h
-
-diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-index fdbf86b..0a3980a 100644
---- a/libavcodec/arm/cabac.h
-+++ b/libavcodec/arm/cabac.h
-@@ -26,13 +26,34 @@
- #include "libavutil/internal.h"
- #include "libavcodec/cabac.h"
- 
-+
-+#if UNCHECKED_BITSTREAM_READER
-+#define LOAD_16BITS_BEHI\
-+        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
-+        "rev        %[tmp]        , %[tmp]                      \n\t"
-+#elif CONFIG_THUMB
-+#define LOAD_16BITS_BEHI\
-+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-+        "it         cs                                          \n\t"\
-+        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-+        "rev        %[tmp]        , %[tmp]                      \n\t"
-+#else
-+#define LOAD_16BITS_BEHI\
-+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-+        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-+        "rev        %[tmp]        , %[tmp]                      \n\t"
-+#endif
-+
-+
- #define get_cabac_inline get_cabac_inline_arm
- static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-                                                  uint8_t *const state)
- {
-     int bit;
-+#if 0
-     void *reg_b, *reg_c, *tmp;
--
-     __asm__ volatile(
-         "ldrb       %[bit]        , [%[state]]                  \n\t"
-         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
-@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
-         : "memory", "cc"
-         );
-+#else
-+   // *** Not thumb compatible yet
-+   unsigned int reg_b, tmp;
-+    __asm__ (
-+        "ldrb       %[bit]        , [%[state]]                  \n\t"
-+        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-+        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-+        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-+        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
-+// %bit = *state
-+// %range = range
-+// %tmp = RangeLPS
-+        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-+        "ittt       ge                                          \n\t"
-+        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+        "mvnge      %[bit]        , %[bit]                      \n\t"
-+        "movge      %[range]      , %[tmp]                      \n\t"
-+
-+        "clz        %[tmp]        , %[range]                    \n\t"
-+        "sub        %[tmp]        , #23                         \n\t"
-+
-+        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-+        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-+        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+        "strb       %[r_b]        , [%[state]]                  \n\t"
-+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+
-+        "bne        2f                                          \n\t"
-+        LOAD_16BITS_BEHI
-+        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
-+        "movw       %[r_b]        , #0xFFFF                     \n\t"
-+        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+
-+        "rbit       %[r_b]        , %[low]                      \n\t"
-+        "clz        %[r_b]        , %[r_b]                      \n\t"
-+        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+#if CONFIG_THUMB
-+        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+        "add        %[low]        , %[low]      , %[tmp]        \n\t"
-+#else
-+        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-+#endif
-+        "2:                                                     \n\t"
-+        :    [bit]"=&r"(bit),
-+             [low]"+&r"(c->low),
-+           [range]"+&r"(c->range),
-+             [r_b]"=&r"(reg_b),
-+             [ptr]"+&r"(c->bytestream),
-+             [tmp]"=&r"(tmp)
-+          :  [state]"r"(state),
-+            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+              [byte]"M"(offsetof(CABACContext, bytestream)),
-+#if !UNCHECKED_BITSTREAM_READER
-+                 [c]"r"(c),
-+               [end]"M"(offsetof(CABACContext, bytestream_end)),
-+#endif
-+           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+        : "memory", "cc"
-+        );
-+#endif
- 
-     return bit & 1;
- }
-+
-+#define get_cabac_bypass get_cabac_bypass_arm
-+static inline int get_cabac_bypass_arm(CABACContext * const c)
-+{
-+    int rv = 0;
-+    unsigned int tmp;
-+    __asm (
-+        "lsl        %[low]        , #1                          \n\t"
-+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-+        "adc        %[rv]         , %[rv]       , #0            \n\t"
-+        "it         cs                                          \n\t"
-+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+        "bne        1f                                          \n\t"
-+        LOAD_16BITS_BEHI
-+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-+        "movw       %[tmp]        , #0xFFFF                     \n\t"
-+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-+        "1:                                                     \n\t"
-+        : // Outputs
-+              [rv]"+&r"(rv),
-+             [low]"+&r"(c->low),
-+             [tmp]"=&r"(tmp),
-+             [ptr]"+&r"(c->bytestream)
-+        : // Inputs
-+#if !UNCHECKED_BITSTREAM_READER
-+                 [c]"r"(c),
-+               [end]"M"(offsetof(CABACContext, bytestream_end)),
-+#endif
-+             [range]"r"(c->range)
-+        : "cc"
-+    );
-+    return rv;
-+}
-+
-+
-+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
-+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
-+{
-+    unsigned int tmp;
-+    __asm (
-+        "lsl        %[low]        , #1                          \n\t"
-+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-+        "ite        cc                                          \n\t"
-+        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
-+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+        "bne        1f                                          \n\t"
-+        LOAD_16BITS_BEHI
-+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-+        "movw       %[tmp]        , #0xFFFF                     \n\t"
-+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-+        "1:                                                     \n\t"
-+        : // Outputs
-+              [rv]"+&r"(rv),
-+             [low]"+&r"(c->low),
-+             [tmp]"=&r"(tmp),
-+             [ptr]"+&r"(c->bytestream)
-+        : // Inputs
-+#if !UNCHECKED_BITSTREAM_READER
-+                 [c]"r"(c),
-+               [end]"M"(offsetof(CABACContext, bytestream_end)),
-+#endif
-+             [range]"r"(c->range)
-+        : "cc"
-+    );
-+    return rv;
-+}
-+
- #endif /* HAVE_ARMV6T2_INLINE */
- 
- #endif /* AVCODEC_ARM_CABAC_H */
-diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
-new file mode 100644
-index 0000000..31d3c59
---- /dev/null
-+++ b/libavcodec/arm/hevc_cabac.h
-@@ -0,0 +1,491 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVC_CABAC_H
-+#define AVCODEC_ARM_HEVC_CABAC_H
-+
-+#include "config.h"
-+#if HAVE_ARMV6T2_INLINE
-+
-+// ARM override for hevc_mem_bits32().
-+// Loads the 32-bit word found at byte offset (bits >> 3) from p, reverses its
-+// byte order with REV and returns the result shifted left by (bits & 7).
-+// NOTE(review): the load is a plain uint32_t dereference at an arbitrary byte
-+// offset, so this presumably relies on the target tolerating unaligned word
-+// loads - confirm.
-+#define hevc_mem_bits32 hevc_mem_bits32_arm
-+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
-+{
-+    unsigned int n;
-+    __asm__ (
-+        "rev        %[n], %[x]                     \n\t"
-+        : [n]"=r"(n)
-+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
-+        :
-+        );
-+    return n << (bits & 7);
-+}
-+
-+
-+// ---------------------------------------------------------------------------
-+//
-+// Helper fns - little bits of code where ARM has an instruction that the
-+// compiler doesn't know about / use
-+
-+// ARM override for trans_scale_sat().
-+// Computes t = ((level * (int)(scale * scale_m)) >> shift) + 1 in C, then uses
-+// a single SSAT with an "ASR #1" operand shift to produce (t >> 1) saturated to
-+// the signed 16-bit range.
-+#define trans_scale_sat trans_scale_sat_arm
-+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+    int rv;
-+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
-+
-+    __asm__ (
-+    "ssat %[rv], #16, %[t], ASR #1 \n\t"
-+    : [rv]"=r"(rv)
-+    : [t]"r"(t)
-+    :
-+    );
-+    return rv;
-+}
-+
-+// ARM override for update_rice(): adjusts *stat_coeff in place.
-+// With t = (last_coeff_abs_level_remaining << 1) >> c_rice_param:
-+//   t == 0  -> *stat_coeff is decremented
-+//   t >= 6  -> *stat_coeff is incremented (ADC adds the carry set by the CMP)
-+// The result is then clamped to 0..255 by USAT before being written back, so
-+// a decrement from 0 ends up as 0 again.
-+#define update_rice update_rice_arm
-+static inline void update_rice_arm(uint8_t * const stat_coeff,
-+    const unsigned int last_coeff_abs_level_remaining,
-+    const unsigned int c_rice_param)
-+{
-+    int t;
-+    __asm__ (
-+    "lsl   %[t], %[coeff], #1               \n\t"
-+    "lsrs  %[t], %[t], %[shift]             \n\t"
-+    "it    eq                               \n\t"
-+    "subeq %[stat], %[stat], #1             \n\t"
-+    "cmp   %[t], #6                         \n\t"
-+    "adc   %[stat], %[stat], #0             \n\t"
-+    "usat  %[stat], #8, %[stat]             \n\t"
-+    : [stat]"+&r"(*stat_coeff),
-+         [t]"=&r"(t)
-+    :  [coeff]"r"(last_coeff_abs_level_remaining),
-+       [shift]"r"(c_rice_param)
-+    : "cc"
-+    );
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC get loops
-+//
-+// Where the loop is simple enough we can normally do 10-30% better than the
-+// compiler
-+
-+// Get the residual greater than 1 bits
-+
-+// ARM override for get_cabac_greater1_bits().
-+// Decodes n CABAC bins in a single asm loop and returns them packed into rv,
-+// first decoded bin in the most significant position (rv = bit | rv << 1).
-+//
-+// For iteration i (counting from 1) the context state used is
-+// state0[min(i, 3)] while every bin decoded so far was 0 (rv == 0), and
-+// state0[0] once any bin has been 1.  The selected state byte is updated in
-+// place from the table at mlps_tables.
-+//
-+// c->low, c->range and c->bytestream are read and written back; 16 bits are
-+// fetched from the bytestream whenever the low 16 bits of c->low become zero.
-+// NOTE(review): the loop body runs before n is tested, so this presumably
-+// expects n > 0 - confirm against callers.
-+// NOTE(review): the [byte] input operand is not referenced by the template.
-+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
-+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
-+    uint8_t * const state0)
-+{
-+    unsigned int i, reg_b, st, tmp, bit, rv;
-+     __asm__ (
-+         "mov        %[i]          , #0                          \n\t"
-+         "mov        %[rv]         , #0                          \n\t"
-+         "1:                                                     \n\t"
-+// Pick the state index: min(i, 3) via USAT while rv == 0, otherwise 0
-+         "add        %[i]          , %[i]        , #1            \n\t"
-+         "cmp        %[rv]         , #0                          \n\t"
-+         "ite        eq                                          \n\t"
-+         "usateq     %[st]         , #2          , %[i]          \n\t"
-+         "movne      %[st]         , #0                          \n\t"
-+
-+// Load the state and look up the LPS range for (range & 0xC0, state)
-+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-+         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-+         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
-+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+// If low >= range << 17 take the other path: subtract, invert the state
-+// value and switch range to the looked-up value
-+         "cmp        %[low]        , %[range], lsl #17           \n\t"
-+         "ittt       ge                                          \n\t"
-+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+         "mvnge      %[bit]        , %[bit]                      \n\t"
-+         "movge      %[range]      , %[tmp]                      \n\t"
-+
-+// Fetch the next state and append the decoded bit to rv
-+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-+         "and        %[bit]        , %[bit]      , #1            \n\t"
-+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
-+
-+// Renormalise: shift low and range left by clz(range) - 23
-+         "clz        %[tmp]        , %[range]                    \n\t"
-+         "sub        %[tmp]        , #23                         \n\t"
-+
-+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+         "it         ne                                          \n\t"
-+         "cmpne      %[n]          , %[i]                        \n\t"
-+         "bne        1b                                          \n\t"
-+
-+// If reload is not required then we must have run out of flags to decode
-+         "tst        %[tmp]        , %[tmp]                      \n\t"
-+         "bne        2f                                          \n\t"
-+
-+// Do reload
-+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-+         "movw       %[r_b]        , #0xFFFF                     \n\t"
-+         "rev        %[tmp]        , %[tmp]                      \n\t"
-+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-+
-+         "rbit       %[r_b]        , %[low]                      \n\t"
-+         "clz        %[r_b]        , %[r_b]                      \n\t"
-+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+
-+#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-+#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-+#endif
-+
-+         "cmp        %[n]          , %[i]                        \n\t"
-+         "bne        1b                                          \n\t"
-+         "2:                                                     \n\t"
-+         :    [bit]"=&r"(bit),
-+              [low]"+&r"(c->low),
-+            [range]"+&r"(c->range),
-+              [r_b]"=&r"(reg_b),
-+             [bptr]"+&r"(c->bytestream),
-+                [i]"=&r"(i),
-+              [tmp]"=&r"(tmp),
-+               [st]"=&r"(st),
-+               [rv]"=&r"(rv)
-+          :  [state0]"r"(state0),
-+                  [n]"r"(n),
-+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+               [byte]"M"(offsetof(CABACContext, bytestream)),
-+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+         : "memory", "cc"
-+    );
-+    return rv;
-+}
-+
-+
-+// ARM override for get_cabac_sig_coeff_flag_idxs().
-+// Walks n downwards, decoding one CABAC bin per step with the context state
-+// state0[ctx_map[n]].  Whenever the decoded bin is 1 the current value of n is
-+// stored as a byte at *p and p is advanced.  The loop stops when n reaches 0,
-+// so ctx_map[0] is never consulted here.
-+//
-+// c         - CABAC context; low, range and bytestream are read and updated
-+// state0    - base of the context state bytes (updated in place)
-+// n         - first index to decode; must be > 0 on entry
-+// ctx_map   - maps an index to its context state offset (read only)
-+// p         - output cursor for the indices whose bin decoded as 1
-+// Returns the advanced output cursor.
-+//
-+// NOTE(review): the [byte] input operand is not referenced by the template.
-+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t * const ctx_map,
-+    uint8_t * p)
-+{
-+    unsigned int reg_b, tmp, st, bit;
-+     __asm__ (
-+         "1:                                                     \n\t"
-+// Get bin from map
-+         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
-+
-+// Load state & ranges
-+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-+         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
-+         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
-+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+         "cmp        %[low]        , %[range], lsl #17           \n\t"
-+         "ittt       ge                                          \n\t"
-+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+         "mvnge      %[bit]        , %[bit]                      \n\t"
-+         "movge      %[range]      , %[tmp]                      \n\t"
-+
-+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-+         "tst        %[bit]        , #1                          \n\t"
-+// GCC asm seems to need strbne written differently for thumb and arm
-+#if CONFIG_THUMB
-+         "it         ne                                          \n\t"
-+         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
-+#else
-+         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
-+#endif
-+
-+// Renorm
-+         "clz        %[tmp]        , %[range]                    \n\t"
-+         "sub        %[tmp]        , #23                         \n\t"
-+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+         "subs       %[n]          , %[n]        , #1            \n\t"
-+#if CONFIG_THUMB
-+         "itt        ne                                          \n\t"
-+         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        1b                                          \n\t"
-+#else
-+         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        1b                                          \n\t"
-+#endif
-+
-+// If we have bits left then n must be 0 so give up now
-+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        2f                                          \n\t"
-+
-+// Do reload
-+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-+         "movw       %[r_b]        , #0xFFFF                     \n\t"
-+         "rev        %[tmp]        , %[tmp]                      \n\t"
-+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-+
-+         "rbit       %[r_b]        , %[low]                      \n\t"
-+         "clz        %[r_b]        , %[r_b]                      \n\t"
-+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+
-+#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-+#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-+#endif
-+
-+// Check to see if we still have more to do
-+         "cmp        %[n]          , #0                          \n\t"
-+         "bne        1b                                          \n\t"
-+         "2:                                                     \n\t"
-+         :    [bit]"=&r"(bit),
-+              [low]"+&r"(c->low),
-+            [range]"+&r"(c->range),
-+              [r_b]"=&r"(reg_b),
-+             [bptr]"+&r"(c->bytestream),
-+              [idx]"+&r"(p),
-+                [n]"+&r"(n),
-+              [tmp]"=&r"(tmp),
-+               [st]"=&r"(st)
-+          :  [state0]"r"(state0),
-+            [ctx_map]"r"(ctx_map),
-+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+               [byte]"M"(offsetof(CABACContext, bytestream)),
-+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+         : "memory", "cc"
-+    );
-+
-+    return p;
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC_BY22 functions
-+//
-+// By and large these are (at best) no faster than their C equivalents - the
-+// only one worth having is _peek where we do a slightly better job than the
-+// compiler
-+//
-+// The others have been stashed here for reference in case larger scale asm
-+// is attempted in which case they might be a useful base
-+
-+
-+// ARM override for get_cabac_by22_peek().
-+// Clears bit 0 of c->low; if c->range is non-zero the value is replaced by the
-+// upper 32 bits of its 64-bit product with c->range (UMULL; the lower half
-+// goes to the scratch output tmp and is discarded).  The result is returned
-+// shifted left by one.  c is not modified.
-+#define get_cabac_by22_peek get_cabac_by22_peek_arm
-+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
-+{
-+    uint32_t rv, tmp;
-+    __asm__ (
-+        "bic      %[rv]  , %[low], #1            \n\t"
-+        "cmp      %[inv] , #0                    \n\t"
-+        "it       ne                             \n\t"
-+        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
-+        :  // Outputs
-+             [rv]"=&r"(rv),
-+             [tmp]"=r"(tmp)
-+        :  // Inputs
-+             [low]"r"(c->low),
-+             [inv]"r"(c->range)
-+        :  // Clobbers
-+                "cc"
-+    );
-+    return rv << 1;
-+}
-+
-+#if 0
-+
-+// ***** Slower than the C  :-(
-+// Disabled (inside #if 0) asm variant of get_cabac_by22_flush().
-+// Adds n to c->by22.bits, then sets
-+//   c->low = ((c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23))
-+//            | ((byte-reversed word at bytestream + (bits >> 3)) << (bits & 7)) >> 9
-+// NOTE(review): the [inv] input operand is not referenced by the template.
-+// NOTE(review): uses a register-specified shift on a data-processing operand
-+// ("lsl %[n]"), a form the enabled code in this file only emits when
-+// CONFIG_THUMB is off - presumably ARM-mode only; confirm before enabling.
-+#define get_cabac_by22_flush get_cabac_by22_flush_arm
-+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
-+{
-+    uint32_t m, tmp;
-+    __asm__ (
-+    "add    %[bits], %[bits], %[n]   \n\t"
-+    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
-+
-+    "rsb    %[tmp], %[n], #32        \n\t"
-+    "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+    "mul    %[tmp], %[range], %[tmp] \n\t"
-+
-+    "rev    %[m], %[m]               \n\t"
-+
-+    "lsl    %[tmp], %[tmp], #23      \n\t"
-+    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+    "and    %[tmp], %[bits], #7         \n\t"
-+    "lsl    %[m], %[m], %[tmp]          \n\t"
-+
-+    "orr    %[low], %[low], %[m], lsr #9      \n\t"
-+        :  // Outputs
-+             [m]"=&r"(m),
-+           [tmp]"=&r"(tmp),
-+          [bits]"+&r"(c->by22.bits),
-+           [low]"+&r"(c->low)
-+        :  // Inputs
-+               [n]"r"(n),
-+             [val]"r"(val),
-+             [inv]"r"(c->range),
-+           [range]"r"(c->by22.range),
-+             [ptr]"r"(c->bytestream)
-+        :  // Clobbers
-+    );
-+}
-+
-+
-+// Works but slower than C
-+// Disabled (inside #if 0) asm variant of coeff_abs_level_remaining_decode_by22().
-+// In one asm block it peeks the bypass bits, counts the leading 1 bits as the
-+// prefix, builds the level from the bits that follow (taking a separate path
-+// for prefix < 3, and a flush + re-peek path when more than 21 bits would be
-+// needed), then flushes the consumed bits.  c->by22.bits and c->low are
-+// updated; the decoded level is returned.
-+// NOTE(review): "umullne" is used here without a preceding "it ne", unlike
-+// get_cabac_by22_peek_arm, and register-specified operand shifts are used
-+// throughout - presumably this was only ever built in ARM mode; confirm
-+// before enabling.
-+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
-+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
-+{
-+    uint32_t n, val, tmp, level;
-+
-+//    PROFILE_START();
-+
-+    __asm__ (
-+            // Peek
-+            "bic    %[val],  %[low],   #1  \n\t"
-+            "cmp    %[inv], #0          \n\t"
-+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-+            "lsl    %[val], %[val], #1  \n\t"
-+
-+            // Count bits (n = prefix)
-+            "mvn    %[n], %[val] \n\t"
-+            "clz    %[n], %[n]   \n\t"
-+
-+            "lsl    %[level], %[val], %[n] \n\t"
-+            "subs   %[tmp], %[n], #3 \n\t"
-+            "blo    2f \n\t"
-+
-+            // prefix >= 3
-+            // < tmp = prefix - 3
-+            // > tmp = prefix + rice - 3
-+            "add    %[tmp], %[tmp], %[rice] \n\t"
-+            // > n = prefix * 2 + rice - 3
-+            "add    %[n], %[tmp], %[n] \n\t"
-+            "cmp    %[n], #21 \n\t"
-+            "bhi    3f \n\t"
-+
-+            "orr    %[level], %[level], #0x80000000 \n\t"
-+            "rsb    %[tmp], %[tmp], #31 \n\t"
-+            "lsr    %[level], %[level], %[tmp] \n\t"
-+
-+            "mov    %[tmp], #2 \n\t"
-+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-+            "b      1f \n\t"
-+
-+            // > 22 bits used in total - need reload
-+            "3:  \n\t"
-+
-+            // Stash prefix + rice - 3 in level (only spare reg)
-+            "mov    %[level], %[tmp] \n\t"
-+            // Restore n to flush value (prefix)
-+            "sub    %[n], %[n], %[tmp] \n\t"
-+
-+            // Flush + reload
-+
-+//          "rsb    %[tmp], %[n], #32        \n\t"
-+//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+//          "mul    %[tmp], %[range], %[tmp] \n\t"
-+
-+            // As it happens we know that all the bits we are flushing are 1
-+            // so we can cheat slightly
-+            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
-+            "lsl    %[tmp], %[tmp], #23      \n\t"
-+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+            "add    %[bits], %[bits], %[n]   \n\t"
-+            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
-+            "rev    %[n], %[n]               \n\t"
-+            "and    %[tmp], %[bits], #7         \n\t"
-+            "lsl    %[n], %[n], %[tmp]          \n\t"
-+
-+            "orr    %[low], %[low], %[n], lsr #9      \n\t"
-+
-+            // (reload)
-+
-+            "bic    %[val],  %[low],   #1  \n\t"
-+            "cmp    %[inv], #0          \n\t"
-+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-+            "lsl    %[val], %[val], #1  \n\t"
-+
-+            // Build value
-+
-+            "mov    %[n], %[level] \n\t"
-+
-+            "orr     %[tmp], %[val], #0x80000000 \n\t"
-+            "rsb     %[level], %[level], #31 \n\t"
-+            "lsr     %[level], %[tmp], %[level] \n\t"
-+
-+            "mov    %[tmp], #2 \n\t"
-+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-+            "b      1f \n\t"
-+
-+            // prefix < 3
-+            "2:  \n\t"
-+            "rsb    %[tmp], %[rice], #31 \n\t"
-+            "lsr    %[level], %[level], %[tmp] \n\t"
-+            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
-+            "add    %[n], %[n], %[rice] \n\t"
-+
-+            "1:  \n\t"
-+            // Flush
-+            "add    %[n], %[n], #1 \n\t"
-+
-+            "rsb    %[tmp], %[n], #32        \n\t"
-+            "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+
-+            "add    %[bits], %[bits], %[n]   \n\t"
-+            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
-+
-+            "mul    %[tmp], %[range], %[tmp] \n\t"
-+            "lsl    %[tmp], %[tmp], #23      \n\t"
-+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+            "rev    %[val], %[val]               \n\t"
-+            "and    %[tmp], %[bits], #7         \n\t"
-+            "lsl    %[val], %[val], %[tmp]          \n\t"
-+
-+            "orr    %[low], %[low], %[val], lsr #9      \n\t"
-+        :  // Outputs
-+         [level]"=&r"(level),
-+             [n]"=&r"(n),
-+           [val]"=&r"(val),
-+           [tmp]"=&r"(tmp),
-+          [bits]"+&r"(c->by22.bits),
-+           [low]"+&r"(c->low)
-+        :  // Inputs
-+            [rice]"r"(c_rice_param),
-+             [inv]"r"(c->range),
-+           [range]"r"(c->by22.range),
-+             [ptr]"r"(c->bytestream)
-+        :  // Clobbers
-+                "cc"
-+    );
-+
-+//    PROFILE_ACC(residual_abs);
-+
-+    return level;
-+}
-+#endif
-+
-+#endif /* HAVE_ARMV6T2_INLINE */
-+
-+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
-diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-index bad4589..a088cc3 100644
---- a/libavcodec/arm/hevcdsp_deblock_neon.S
-+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-@@ -409,10 +409,12 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-         beq         90f
- 
-         tst         a3, #1
-+        itee        ne
-         ldrne       a3, [v5, #0]    @ curr->mv[0]
-         ldreq       a3, [v5, #4]    @ curr->mv[1]
-         moveq       v1, v2
-         tst         v8, #1
-+        itee        ne
-         ldrne       v8, [v6, #0]    @ neigh->mv[0]
-         ldreq       v8, [v6, #4]    @ neigh->mv[1]
-         moveq       v3, v4
-@@ -424,9 +426,14 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-         sel         a3, a3, ip
-         ands        a3, a3, lr
-         @ drop through
--10:     movne       a3, #1
-+10:     it          ne
-+        movne       a3, #1
- 11:     subs        a2, a2, #1
--12:     strbhs      a3, [v7], a4
-+12:
-+A       strbhs      a3, [v7], a4
-+T       itt         hs
-+T       strbhs      a3, [v7]
-+T       addhs       v7, v7, a4
-         subs        a2, a2, #1
-         bhs         12b
- 
-@@ -442,6 +449,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-         bne         10b
- 
-         teq         v1, v3
-+        it          eq
-         teqeq       v2, v4
-         bne         40f
-         teq         v1, v2
-@@ -487,6 +495,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-         b           10b
- 
- 40:     teq         v1, v4
-+        ite         eq
-         teqeq       v2, v3
-         bne         10b
- 
-diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-index 516ae5b..00eab9e 100644
---- a/libavcodec/arm/hevcdsp_epel_neon.S
-+++ b/libavcodec/arm/hevcdsp_epel_neon.S
-@@ -110,7 +110,9 @@ function ff_hevc_put_epel_h_neon_8, export=1
-         sub    r7, #1
-         lsl    r7, #2
-         vpush {d8-d15}
--        adrl   r12, epel_coeffs
-+@ adr reaches if we are in thumb mode but not in arm
-+T       adr    r12, epel_coeffs
-+A       adrl   r12, epel_coeffs
-         add    r7, r12
-         sub       r1, #1
-         lsl       r4, #1
-@@ -170,7 +172,8 @@ function ff_hevc_put_epel_v_neon_8, export=1
-         sub    r7, #1
-         lsl    r7, #2
-         vpush {d8-d15}
--        adrl   r12, epel_coeffs
-+T       adr    r12, epel_coeffs
-+A       adrl   r12, epel_coeffs
-         add    r7, r12
-         load_coeffs_16b r7
-         sub       r1, r2
-@@ -246,7 +249,7 @@ function ff_hevc_put_epel_hv_neon_8, export=1
-         sub    r7, #1
-         lsl    r7, #2
-         vpush {d8-d15}
--        adrl   r12, epel_coeffs
-+        adr    r12, epel_coeffs
-         sub    r6, #1
-         lsl    r6, #2
-         add    r6, r12 // mx epel coeff offset
-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
-index 1bf1c62..ccfa991 100644
---- a/libavcodec/cabac.h
-+++ b/libavcodec/cabac.h
-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
- typedef struct CABACContext{
-     int low;
-     int range;
--    int outstanding_count;
-+    union
-+    {
-+        int outstanding_count;
-+        struct {
-+            uint16_t bits;
-+            uint16_t range;
-+        } by22;
-+    };
-     const uint8_t *bytestream_start;
-     const uint8_t *bytestream;
-     const uint8_t *bytestream_end;
-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-index 8656917..4caf720 100644
---- a/libavcodec/hevc_cabac.c
-+++ b/libavcodec/hevc_cabac.c
-@@ -21,14 +21,72 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#define UNCHECKED_BITSTREAM_READER 1
-+
- #include "libavutil/attributes.h"
- #include "libavutil/common.h"
- 
--#include "cabac_functions.h"
- #include "hevc.h"
-+#include "cabac_functions.h"
-+
-+// BY22 is probably faster than simple bypass if the processor has
-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-+// x86 has fast int divide
-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-+// Use native divide if we have a fast one - otherwise use mpy 1/x
-+// x86 has a fast integer divide - arm doesn't - unsure about other
-+// architectures
-+#define USE_BY22_DIV  ARCH_X86
-+
-+// Special case blocks with a single significant coeff
-+// Decreases the complexity of the code for a common case but increases the
-+// code size.
-+#define USE_N_END_1 1
-+
-+#if ARCH_ARM
-+#include "arm/hevc_cabac.h"
-+#endif
- 
- #define CABAC_MAX_BIN 31
- 
-+
-+#if USE_BY22 && !USE_BY22_DIV
-+// I(x) = floor(2^40 / x) + 1, truncated to 32 bits.
-+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
-+
-+// Reciprocal table used in place of an integer divide when USE_BY22_DIV is 0.
-+// Entry k holds I(256 + k) for k = 1..255; entry 0 is stored as 0 rather than
-+// I(256).  get_cabac_by22_start() indexes it with (c->range & 0xff) and
-+// get_cabac_by22_peek() skips the multiply when the stored value is 0.
-+static const uint32_t cabac_by22_inv_range[256] = {
-+                                                    0,      I(257), I(258), I(259),
-+    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
-+    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
-+    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
-+    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
-+    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
-+    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
-+    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
-+    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
-+    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
-+    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
-+    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
-+    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
-+    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
-+    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
-+    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
-+    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
-+    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
-+    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
-+    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
-+    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
-+    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
-+    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
-+    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
-+    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
-+    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
-+    I(510), I(511)
-+};
-+#undef I
-+#endif  // USE_BY22
-+
- /**
-  * number of bin by SyntaxElement.
-  */
-@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
-     { 28, 36, 43, 49, 54, 58, 61, 63, },
- };
- 
-+
-+// One precomputed pair of offsets for an (x, y) position:
-+//   coeff = x + (y << t)                      (see XYT_C)
-+//   scale = the same with x, y and t reduced  (see XYT_S)
-+// NOTE(review): presumably coeff indexes the coefficient block and scale the
-+// scaling matrix - confirm against the users of off_xys.
-+typedef struct
-+{
-+    uint16_t coeff;
-+    uint16_t scale;
-+} xy_off_t;
-+
-+// Offset of (x, y) in a row-major block whose rows are (1 << t) apart
-+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
-+// t clamped to at most 3
-+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
-+// Amount by which x and y are shifted down to fit the clamped size
-+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
-+// As XYT_C but with x and y shifted right by SCALE_SHR(t) and a row shift of
-+// SCALE_TRAFO(t)
-+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
-+
-+// Initialiser for one xy_off_t
-+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
-+
-+// The 16 positions of a 4x4 group listed along its anti-diagonals
-+#define OFF_DIAG(t) {\
-+    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
-+    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
-+    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
-+    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+// The 16 positions of a 4x4 group listed with x varying fastest
-+#define OFF_HORIZ(t) {\
-+    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
-+    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
-+    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
-+    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
-+}
-+
-+// The 16 positions of a 4x4 group listed with y varying fastest
-+#define OFF_VERT(t) {\
-+    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
-+    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
-+    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
-+    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+// Indexed as [order][t - 2][position]: order 0/1/2 selects the diagonal,
-+// horizontal or vertical listing above, and the middle index covers
-+// t = 2, 3, 4, 5.
-+static const xy_off_t off_xys[3][4][16] =
-+{
-+    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
-+    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
-+    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
-+};
-+
-+
-+// Helper fns
-+#ifndef hevc_mem_bits32
-+// Generic fallback used when no platform header has defined hevc_mem_bits32.
-+// Reads a big-endian 32-bit value starting at the byte that contains bit
-+// position `offset` of buf, then shifts it left by the remaining 0..7 bits.
-+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
-+{
-+    const uint8_t * const first_byte = (const uint8_t *)buf + (offset >> 3);
-+    const unsigned int bit_in_byte = offset & 7;
-+
-+    return AV_RB32(first_byte) << bit_in_byte;
-+}
-+#endif
-+
-+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
-+#define hevc_clz32 hevc_clz32_builtin
-+// Count leading zeros of a 32-bit value using the compiler builtin.
-+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
-+{
-+    // __builtin_clz counts over a full int, so discount any bits an int has
-+    // beyond 32
-+    const unsigned int extra_int_bits = sizeof(int) * 8 - 32;
-+
-+    return __builtin_clz(x) - extra_int_bits;
-+}
-+#endif
-+
-+// Portable count-leading-zeros, only compiled when nothing above has supplied
-+// hevc_clz32 - not expected to be needed in practice but kept for completeness
-+#ifndef hevc_clz32
-+static inline unsigned int hevc_clz32(unsigned int x)
-+{
-+    // Successively narrower tests on the top bits: 16, 8, 4, then 2
-+    static const unsigned int top_masks[4] = { 0xffff0000, 0xff000000, 0xf0000000, 0xc0000000 };
-+    static const unsigned int widths[4]    = { 16, 8, 4, 2 };
-+    unsigned int zeros = 1;
-+    unsigned int step;
-+
-+    for (step = 0; step < 4; step++) {
-+        if (!(x & top_masks[step])) {
-+            zeros += widths[step];
-+            x <<= widths[step];
-+        }
-+    }
-+
-+    // The count started at 1; take that back if the top bit is now set
-+    return zeros - ((x >> 31) & 1);
-+}
-+#endif
-+
-+
-+#if !USE_BY22
-+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
-+// will no longer be called but the setup calls will still exist and we want
-+// to null them out
-+#define bypass_start(s)
-+#define bypass_finish(s)
-+#else
-+// Use BY22 for residual bypass block
-+
-+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
-+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
-+
-+// BY22 notes that bypass is simply a divide into the bitstream and so we
-+// can peek out large quantities of bits at once and treat the result as if
-+// it was VLC.  In many cases this will lead to O(1) processing rather than
-+// O(n) though the setup and teardown is sufficiently expensive that it is
-+// only worth using if we expect to be dealing with more than a few bits.
-+// The definition of "a few bits" will vary from platform to platform but
-+// tests on ARM show that it probably isn't worth it for a single coded
-+// residual, but is for >1 - this is probably reinforced by the fact that if
-+// there are more residuals then they are likely to be bigger and this will
-+// make the O(1) nature of the code more worthwhile.
-+
-+
-+#if !USE_BY22_DIV
-+// * 1/x @ 32 bits gets us 22 bits of accuracy
-+#define CABAC_BY22_PEEK_BITS  22
-+#else
-+// A real 32-bit divide gets us another bit
-+// If we have a 64 bit int & a unit time divider then we should get a lot
-+// of bits (55)  but that is untested and it is unclear if it would give
-+// us a large advantage
-+#define CABAC_BY22_PEEK_BITS  23
-+#endif
-+
-+// Enter a bypass block
-+// Switches the CABAC context into the form _by22_peek / _by22_flush work on;
-+// it has to run before the first _by22_peek.  Call _by22_finish afterwards to
-+// get back to 'normal' (i.e. non-bypass) cabac decoding
-+static inline void get_cabac_by22_start(CABACContext * const c)
-+{
-+    const unsigned int trailing_zeros = __builtin_ctz(c->low);
-+    const uint32_t stream_word = hevc_mem_bits32(c->bytestream, 0);
-+    const uint32_t low_part = c->low << (22 - CABAC_BITS);
-+    const uint32_t stream_part = (stream_word ^ 0x80000000U) >> (9 + CABAC_BITS - trailing_zeros);
-+
-+#if !USE_BY22_DIV
-+    // Park the real range and put its reciprocal where _by22_peek reads it
-+    c->by22.range = c->range;
-+    c->range = cabac_by22_inv_range[c->range & 0xff];
-+#endif
-+    c->by22.bits = trailing_zeros;
-+    c->bytestream -= (CABAC_BITS / 8);
-+    c->low = low_part ^ stream_part;
-+}
-+
-+// Leave a bypass block
-+// Has to run once the bypass block is done so that normal decoding can resume
-+static inline void get_cabac_by22_finish(CABACContext * const c)
-+{
-+    const unsigned int consumed = c->by22.bits;
-+    const unsigned int partial_mask = (CABAC_BITS == 16) ? 15 : 7;
-+    const unsigned int partial_bits = consumed & partial_mask;
-+    const unsigned int whole_bytes = (consumed / CABAC_BITS) * (CABAC_BITS / 8);
-+    const uint32_t by22_low = (uint32_t)c->low;
-+
-+#if !USE_BY22_DIV
-+    // Put back the range that _by22_start parked
-+    c->range = c->by22.range;
-+#endif
-+    c->low = ((by22_low >> (22 - CABAC_BITS + partial_bits)) | 1) << partial_bits;
-+    c->bytestream += whole_bytes + (CABAC_BITS / 8);
-+}
-+
-+// Look at the upcoming bypass bits without consuming them
-+// Only valid between _by22_start and _by22_finish; follow it with
-+// _by22_flush to drop whatever bits were used
-+// The number of valid bits returned is
-+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
-+// is at least 22, which should cover any single prefix or suffix though
-+// probably not the worst case combination of the two
-+#ifndef get_cabac_by22_peek
-+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
-+{
-+#if USE_BY22_DIV
-+    const unsigned int quotient = (unsigned int)c->low / (unsigned int)c->range;
-+
-+    return quotient << 9;
-+#else
-+    const uint32_t recip = c->range;
-+    const uint32_t low_even = c->low & ~1U;
-+
-+    // A zero reciprocal means "use low as it is"
-+    if (recip == 0)
-+        return low_even << 1;
-+
-+    return (uint32_t)(((uint64_t)low_even * (uint64_t)recip) >> 32) << 1;
-+#endif
-+}
-+#endif
-+
-+// Flush bypass bits peeked by _by22_peek
-+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
-+// val is an unmodified copy of whatever _by22_peek returned
-+#ifndef get_cabac_by22_flush
-+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
-+{
-+    // Subtract the bits used & reshift up to the top of the word
-+#if USE_BY22_DIV
-+    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
-+#else
-+    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
-+#endif
-+
-+    // and refill lower bits
-+    // We will probably OR over some existing bits but that doesn't matter
-+    c->by22.bits += n;
-+    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
-+}
-+#endif
-+
-+#endif  // USE_BY22
-+
-+
- void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
- {
-     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
-     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
- }
- 
--static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
-+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
- {
--    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
-+    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
- }
- 
--static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
-+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
- {
--    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
-+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
- }
- 
--static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
-+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
- {
--    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
-+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
- }
- 
- int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
-@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
-     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
- }
- 
--static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
-+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
-                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
- {
-     int i = 0;
-     int max = (log2_size << 1) - 1;
-     int ctx_offset, ctx_shift;
- 
--    if (!c_idx) {
-+    if (!c_idx_nz) {
-         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
-         ctx_shift = (log2_size + 1) >> 2;
-     } else {
-@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
-     return value;
- }
- 
--static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
-+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
- {
-     int inc;
- 
--    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
-+    inc = (ctx_cg != 0) + (c_idx_nz << 1);
- 
-     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
- }
--static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
--                                           int offset, const uint8_t *ctx_idx_map)
--{
--    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
--    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
--}
- 
--static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
-+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
- {
-     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
- }
-@@ -966,65 +1223,305 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
-     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
- }
- 
--static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
-+
-+#if !USE_BY22
-+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
-+#endif
-+
-+
-+#ifndef coeff_abs_level_remaining_decode_bypass
-+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
-+{
-+    CABACContext * const c = &s->HEVClc->cc;
-+    uint32_t y;
-+    unsigned int prefix;
-+    unsigned int last_coeff_abs_level_remaining;
-+    unsigned int n;
-+
-+    y = get_cabac_by22_peek(c);
-+    prefix = hevc_clz32(~y);
-+    // y << prefix will always have top bit 0
-+
-+    if (prefix < 3) {
-+        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
-+        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
-+        n = prefix + 1 + rice_param;
-+    }
-+    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
-+    {
-+        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
-+
-+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+        n = prefix * 2 + rice_param - 2;
-+    }
-+    else {
-+        unsigned int suffix;
-+
-+        get_cabac_by22_flush(c, prefix, y);
-+        y = get_cabac_by22_peek(c);
-+
-+        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
-+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+        n = prefix + rice_param - 2;
-+    }
-+
-+    get_cabac_by22_flush(c, n, y);
-+
-+    return last_coeff_abs_level_remaining;
-+}
-+#endif
-+
-+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
- {
-+    CABACContext * const c = &s->HEVClc->cc;
-     int prefix = 0;
-     int suffix = 0;
-     int last_coeff_abs_level_remaining;
-     int i;
- 
--    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
-+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
-         prefix++;
-     if (prefix == CABAC_MAX_BIN) {
-         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
-         return 0;
-     }
-+
-     if (prefix < 3) {
-         for (i = 0; i < rc_rice_param; i++)
--            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-+            suffix = (suffix << 1) | get_cabac_bypass(c);
-         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
-     } else {
-         int prefix_minus3 = prefix - 3;
-         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
--            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-+            suffix = (suffix << 1) | get_cabac_bypass(c);
-         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
-                                               << rc_rice_param) + suffix;
-     }
-+
-     return last_coeff_abs_level_remaining;
- }
- 
--static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
-+#if !USE_BY22
-+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
-+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
- {
--    int i;
--    int ret = 0;
-+    CABACContext * const c = &s->HEVClc->cc;
-+    unsigned int i;
-+    uint32_t ret = 0;
- 
-     for (i = 0; i < nb; i++)
--        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
--    return ret;
-+        ret = (ret << 1) | get_cabac_bypass(c);
-+
-+    return ret << (32 - nb);
- }
-+#endif
-+
-+#ifndef coeff_sign_flag_decode_bypass
-+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
-+{
-+    CABACContext * const c = &s->HEVClc->cc;
-+    uint32_t y;
-+    y = get_cabac_by22_peek(c);
-+    get_cabac_by22_flush(c, nb, y);
-+    return y & ~(0xffffffffU >> nb);
-+}
-+#endif
-+
-+
-+#ifndef get_cabac_greater1_bits
-+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
-+    uint8_t * const state0)
-+{
-+    unsigned int i;
-+    unsigned int rv = 0;
-+    for (i = 0; i != n; ++i) {
-+        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
-+        const unsigned int b = get_cabac(c, state0 + idx);
-+        rv = (rv << 1) | b;
-+    }
-+    return rv;
-+}
-+#endif
-+
-+
-+// N.B. levels returned are the values assuming coeff_abs_level_remaining
-+// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
-+// this version of events.
-+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
-+    int * const pprev_subset_coded, int * const psum,
-+    const unsigned int idx0_gt1, const unsigned int idx_gt2)
-+{
-+    CABACContext * const c = &s->HEVClc->cc;
-+    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
-+    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
-+    unsigned int rv;
-+    unsigned int i;
-+    const unsigned int n = FFMIN(n_end, 8);
-+
-+    // Really this is i != n but the simple unconditional loop is cheaper
-+    // and faster
-+    for (i = 0; i != 8; ++i)
-+        levels[i] = 1;
-+
-+    rv = get_cabac_greater1_bits(c, n, state0);
-+
-+    *pprev_subset_coded = 0;
-+    *psum = n;
-+
-+    rv <<= (32 - n);
-+    if (rv != 0)
-+    {
-+        *pprev_subset_coded = 1;
-+        *psum = n + 1;
-+        i = hevc_clz32(rv);
-+        levels[i] = 2;
-+        if (get_cabac(c, state_gt2) == 0)
-+        {
-+            // Unset first coded bit
-+            rv &= ~(0x80000000U >> i);
-+        }
-+    }
-+
-+    if (n_end > 8) {
-+        const unsigned int g8 = n_end - 8;
-+        rv |= ((1 << g8) - 1) << (24 - g8);
-+        for (i = 0; i != g8; ++i) {
-+            levels[i + 8] = 0;
-+        }
-+    }
-+
-+    return rv;
-+}
-+
-+// extended_precision_processing_flag must be false given we are
-+// putting the result into a 16-bit array
-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-+// scale_m is uint8_t
-+//
-+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
-+//   or it can be 2 (if we have transquant_bypass)
-+// shift is set to one less than we really want but would normally be
-+//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-+// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6
-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-+// to achieve it
-+
-+#ifndef trans_scale_sat
-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-+}
-+#endif
-+
-+
-+#ifndef update_rice
-+static inline void update_rice(uint8_t * const stat_coeff,
-+    const unsigned int last_coeff_abs_level_remaining,
-+    const unsigned int c_rice_param)
-+{
-+    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
-+    if (x >= 6)
-+        (*stat_coeff)++;
-+    else if (x == 0 && *stat_coeff > 0)
-+        (*stat_coeff)--;
-+}
-+#endif
-+
-+
-+// n must be > 0 on entry
-+#ifndef get_cabac_sig_coeff_flag_idxs
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t const * ctx_map,
-+    uint8_t * p)
-+{
-+    do {
-+        if (get_cabac(c, state0 + ctx_map[n]))
-+            *p++ = n;
-+    } while (--n != 0);
-+    return p;
-+}
-+#endif
-+
-+
-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t const * ctx_map,
-+    uint8_t * const flag_idx)
-+{
-+    int rv;
-+
-+    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
-+
-+    return rv;
-+}
-+
-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x1,  x2,  x3,\
-+     x4,  x5,  x6,  x7,\
-+     x8,  x9, x10, x11,\
-+    x12, x13, x14, x15}
-+
-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x4,  x8, x12,\
-+     x1,  x5,  x9, x13,\
-+     x2,  x6, x10, x14,\
-+     x3,  x7, x11, x15}
-+
-+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+     x0,  x4,  x1,  x8,\
-+     x5,  x2, x12,  x9,\
-+     x6,  x3, x13, x10,\
-+     x7, x14, x11, x15}
-+
-+
-+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
-+    uint8_t * const significant_coeff_group_flag,
-+    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
-+    int * const pPrev_sig)
-+{
-+    while (--i >= 0) {
-+        unsigned int x_cg = scan_x_cg[i];
-+        unsigned int y_cg = scan_y_cg[i];
-+
-+        // For the flag decode we only care about Z/NZ but
-+        // we use the full Right + Down * 2 when calculating
-+        // significant coeff flags so we obtain it here
-+        //.
-+        // The group flag array is one longer than it needs to
-+        // be so we don't need to check for y_cg limits
-+        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
-+            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
-+
-+        if (i == 0 ||
-+            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
-+        {
-+            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
-+            *pPrev_sig = prev_sig;
-+            break;
-+        }
-+    }
-+
-+    return i;
-+}
-+
- 
- void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                                 int log2_trafo_size, enum ScanType scan_idx,
-                                 int c_idx)
- {
--#define GET_COORD(offset, n)                                    \
--    do {                                                        \
--        x_c = (x_cg << 2) + scan_x_off[n];                      \
--        y_c = (y_cg << 2) + scan_y_off[n];                      \
--    } while (0)
--    HEVCLocalContext *lc = s->HEVClc;
--    int transform_skip_flag = 0;
-+    HEVCLocalContext * const lc = s->HEVClc;
-+    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
- 
-     int last_significant_coeff_x, last_significant_coeff_y;
--    int last_scan_pos;
--    int n_end;
-     int num_coeff = 0;
--    int greater1_ctx = 1;
-+    int prev_subset_coded = 0;
- 
-     int num_last_subset;
-     int x_cg_last_sig, y_cg_last_sig;
- 
--    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-+    const uint8_t *scan_x_cg, *scan_y_cg;
-+    const xy_off_t * scan_xy_off;
- 
-     ptrdiff_t stride = s->frame->linesize[c_idx];
-     int hshift = s->ps.sps->hshift[c_idx];
-@@ -1032,21 +1529,28 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
- #ifdef RPI
--    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-+    //***** transform_skip_flag decoded later!
-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
- #endif
-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
--    uint8_t significant_coeff_group_flag[8][8] = {{0}};
-+    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
-     int explicit_rdpcm_flag = 0;
-     int explicit_rdpcm_dir_flag;
- 
-     int trafo_size = 1 << log2_trafo_size;
-     int i;
--    int qp,shift,add,scale,scale_m;
-+    int qp,shift,scale;
-     static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-     const uint8_t *scale_matrix = NULL;
-     uint8_t dc_scale;
-     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-                                          lc->tu.intra_pred_mode_c;
-+
-+    int prev_sig = 0;
-+    const int c_idx_nz = (c_idx != 0);
-+
-+    int may_hide_sign;
-+
- #ifdef RPI
-     if (s->enable_rpi) {
-         int n = trafo_size * trafo_size;
-@@ -1078,7 +1582,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
- 
-     // Derive QP for dequant
-     if (!lc->cu.cu_transquant_bypass_flag) {
--        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-+        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-         static const uint8_t rem6[51 + 4 * 6 + 1] = {
-             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-@@ -1094,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-         };
-         int qp_y = lc->qp_y;
- 
-+        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
-+
-         if (s->ps.pps->transform_skip_enabled_flag &&
-             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
--            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
-+            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
-+            if (transform_skip_flag) {
-+                trans_skip_or_bypass = 1;
-+                if (lc->cu.pred_mode ==  MODE_INTRA  &&
-+                    s->ps.sps->implicit_rdpcm_enabled_flag &&
-+                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
-+                    may_hide_sign = 0;
-+                }
-+            }
-         }
- 
-         if (c_idx == 0) {
-@@ -1129,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-             qp += s->ps.sps->qp_bd_offset;
-         }
- 
--        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
--        add      = 1 << (shift-1);
--        scale    = level_scale[rem6[qp]] << (div6[qp]);
--        scale_m  = 16; // default when no custom scaling lists.
--        dc_scale = 16;
-+        // Shift is set to one less than will actually occur as the scale
-+        // and saturate step adds 1 and then shifts right again
-+        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
-+        scale = level_scale[rem6[qp]];
-+        if (div6[qp] >= shift) {
-+            scale <<= (div6[qp] - shift);
-+            shift = 0;
-+        } else {
-+            shift -= div6[qp];
-+        }
- 
--        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-+        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
-             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
--            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-+                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
- 
-             matrix_id = 3 * matrix_id + c_idx;
- 
-             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-+            dc_scale = scale_matrix[0];
-             if (log2_trafo_size >= 4)
-                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-         }
-+        else
-+        {
-+            static const uint8_t sixteen_scale[64] = {
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16,
-+                16, 16, 16, 16, 16, 16, 16, 16
-+            };
-+            scale_matrix = sixteen_scale;
-+            dc_scale = 16;
-+        }
-     } else {
-+        static const uint8_t unit_scale[64] = {
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+            1, 1, 1, 1, 1, 1, 1, 1,
-+        };
-+        scale_matrix = unit_scale;
-         shift        = 0;
--        add          = 0;
--        scale        = 0;
--        dc_scale     = 0;
-+        scale        = 2;  // We will shift right to kill this
-+        dc_scale     = 1;
-+
-+        may_hide_sign = 0;
-     }
- 
-     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
--        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
--        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
-+        trans_skip_or_bypass) {
-+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
-         if (explicit_rdpcm_flag) {
--            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
-+            may_hide_sign = 0;
-+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
-         }
-     }
- 
--    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
-+    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
-                                            &last_significant_coeff_x, &last_significant_coeff_y);
- 
-     if (last_significant_coeff_x > 3) {
-@@ -1189,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-         int last_x_c = last_significant_coeff_x & 3;
-         int last_y_c = last_significant_coeff_y & 3;
- 
--        scan_x_off = ff_hevc_diag_scan4x4_x;
--        scan_y_off = ff_hevc_diag_scan4x4_y;
-         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
--        if (trafo_size == 4) {
-+
-+        switch (log2_trafo_size) {
-+        case 2:
-             scan_x_cg = scan_1x1;
-             scan_y_cg = scan_1x1;
--        } else if (trafo_size == 8) {
-+            break;
-+        case 3:
-             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-             scan_x_cg = diag_scan2x2_x;
-             scan_y_cg = diag_scan2x2_y;
--        } else if (trafo_size == 16) {
-+            break;
-+        case 4:
-             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-             scan_x_cg = ff_hevc_diag_scan4x4_x;
-             scan_y_cg = ff_hevc_diag_scan4x4_y;
--        } else { // trafo_size == 32
-+            break;
-+        case 5:
-+        default:
-             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-             scan_x_cg = ff_hevc_diag_scan8x8_x;
-             scan_y_cg = ff_hevc_diag_scan8x8_y;
-+            break;
-         }
-         break;
-     }
-     case SCAN_HORIZ:
-         scan_x_cg = horiz_scan2x2_x;
-         scan_y_cg = horiz_scan2x2_y;
--        scan_x_off = horiz_scan4x4_x;
--        scan_y_off = horiz_scan4x4_y;
-         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-         break;
-     default: //SCAN_VERT
-         scan_x_cg = horiz_scan2x2_y;
-         scan_y_cg = horiz_scan2x2_x;
--        scan_x_off = horiz_scan4x4_y;
--        scan_y_off = horiz_scan4x4_x;
-         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-         break;
-     }
-     num_coeff++;
-     num_last_subset = (num_coeff - 1) >> 4;
- 
--    for (i = num_last_subset; i >= 0; i--) {
--        int n, m;
--        int x_cg, y_cg, x_c, y_c, pos;
--        int implicit_non_zero_coeff = 0;
--        int64_t trans_coeff_level;
--        int prev_sig = 0;
--        int offset = i << 4;
--        int rice_init = 0;
--
--        uint8_t significant_coeff_flag_idx[16];
--        uint8_t nb_significant_coeff_flag = 0;
-+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
- 
--        x_cg = scan_x_cg[i];
--        y_cg = scan_y_cg[i];
-+    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
- 
--        if ((i < num_last_subset) && (i > 0)) {
--            int ctx_cg = 0;
--            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
--                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
--            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
--                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
--
--            significant_coeff_group_flag[x_cg][y_cg] =
--                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
--            implicit_non_zero_coeff = 1;
--        } else {
--            significant_coeff_group_flag[x_cg][y_cg] =
--            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
--             (x_cg == 0 && y_cg == 0));
--        }
-+    i = num_last_subset;
-+    do {
-+        int implicit_non_zero_coeff = 0;
-+        int n_end;
- 
--        last_scan_pos = num_coeff - offset - 1;
-+        uint8_t significant_coeff_flag_idx[16];
-+        unsigned int nb_significant_coeff_flag = 0;
- 
-         if (i == num_last_subset) {
-+            // First time through
-+            int last_scan_pos = num_coeff - (i << 4) - 1;
-             n_end = last_scan_pos - 1;
-             significant_coeff_flag_idx[0] = last_scan_pos;
-             nb_significant_coeff_flag = 1;
-         } else {
-             n_end = 15;
-+            implicit_non_zero_coeff = (i != 0);
-         }
- 
--        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
--            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
--        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
--            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
--
--        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
--            static const uint8_t ctx_idx_map[] = {
--                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
--                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
--                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
--                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
--                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
-+        if (n_end >= 0) {
-+            static const uint8_t ctx_idx_maps_ts2[3][16] = {
-+                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
-+            };
-+            static const uint8_t ctx_idx_maps[3][4][16] = {
-+                {
-+                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-+                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-+                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                },
-+                {
-+                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-+                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-+                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                },
-+                {
-+                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-+                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-+                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-+                }
-             };
-             const uint8_t *ctx_idx_map_p;
-             int scf_offset = 0;
--            if (s->ps.sps->transform_skip_context_enabled_flag &&
--                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
--                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
--                if (c_idx == 0) {
--                    scf_offset = 40;
--                } else {
--                    scf_offset = 14 + 27;
--                }
-+
-+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+                ctx_idx_map_p = ctx_idx_maps[0][3];
-+                scf_offset = 40 + c_idx_nz;
-             } else {
--                if (c_idx != 0)
-+                if (c_idx_nz != 0)
-                     scf_offset = 27;
-+
-                 if (log2_trafo_size == 2) {
--                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
-+                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
-                 } else {
--                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
--                    if (c_idx == 0) {
--                        if ((x_cg > 0 || y_cg > 0))
-+                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
-+                    if (!c_idx_nz) {
-+                        if (i != 0)
-                             scf_offset += 3;
-+
-                         if (log2_trafo_size == 3) {
-                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
-                         } else {
-@@ -1315,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-                     }
-                 }
-             }
--            for (n = n_end; n > 0; n--) {
--                x_c = scan_x_off[n];
--                y_c = scan_y_off[n];
--                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
--                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
--                    nb_significant_coeff_flag++;
-+
-+            if (n_end > 0) {
-+                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
-+                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
-+                    n_end, ctx_idx_map_p,
-+                    significant_coeff_flag_idx + nb_significant_coeff_flag);
-+
-+                nb_significant_coeff_flag += cnt;
-+                if (cnt != 0) {
-                     implicit_non_zero_coeff = 0;
-                 }
-             }
-+
-             if (implicit_non_zero_coeff == 0) {
--                if (s->ps.sps->transform_skip_context_enabled_flag &&
--                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
--                    if (c_idx == 0) {
--                        scf_offset = 42;
--                    } else {
--                        scf_offset = 16 + 27;
--                    }
-+                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+                    scf_offset = 42 + c_idx_nz;
-                 } else {
-                     if (i == 0) {
--                        if (c_idx == 0)
--                            scf_offset = 0;
--                        else
--                            scf_offset = 27;
-+                        scf_offset = c_idx_nz ? 27 : 0;
-                     } else {
-                         scf_offset = 2 + scf_offset;
-                     }
-                 }
--                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
-+                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
-                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-                     nb_significant_coeff_flag++;
-                 }
-@@ -1352,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-             }
-         }
- 
--        n_end = nb_significant_coeff_flag;
--
-+        if (nb_significant_coeff_flag != 0) {
-+            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
-+                ((i != 0 && !c_idx_nz) ? 2 : 0) |
-+                prev_subset_coded;
-+            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
-+                (gt1_idx_delta << 2);
-+            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
-+                gt1_idx_delta;
-+
-+            const unsigned int x_cg = scan_x_cg[i];
-+            const unsigned int y_cg = scan_y_cg[i];
-+            int16_t * const blk_coeffs = coeffs +
-+                ((x_cg + (y_cg << log2_trafo_size)) << 2);
-+            // This calculation is 'wrong' for log2_traffo_size == 2
-+            // but that doesn't mattor as in this case x_cg & y_cg
-+            // are always 0 so result is correct (0) anyway
-+            const uint8_t * const blk_scale = scale_matrix +
-+                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
-+
-+            // * THe following code block doesn't deal with these flags:
-+            //   (nor did the one it replaces)
-+            //
-+            // cabac_bypass_alignment_enabled_flag
-+            //    This should be easy but I can't find a test case
-+            // extended_precision_processing_flag
-+            //    This can extend the required precision past 16bits
-+            //    so is probably tricky - also no example found yet
-+
-+#if USE_N_END_1
-+            if (nb_significant_coeff_flag == 1) {
-+                // There is a small gain to be had from special casing the single
-+                // transform coefficient case.  The reduction in complexity
-+                // makes up for the code duplicatioon.
-+
-+                int trans_coeff_level = 1;
-+                int coeff_sign_flag;
-+                int coded_val = 0;
-+
-+                // initialize first elem of coeff_bas_level_greater1_flag
-+                prev_subset_coded = 0;
-+
-+                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
-+                    trans_coeff_level = 2;
-+                    prev_subset_coded = 1;
-+                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
-+                }
- 
--        if (n_end) {
--            int first_nz_pos_in_cg;
--            int last_nz_pos_in_cg;
--            int c_rice_param = 0;
--            int first_greater1_coeff_idx = -1;
--            uint8_t coeff_abs_level_greater1_flag[8];
--            uint16_t coeff_sign_flag;
--            int sum_abs = 0;
--            int sign_hidden;
--            int sb_type;
-+                // Probably not worth the overhead of starting by22 for just one value
-+                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
- 
-+                if (coded_val)
-+                {
-+                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-+                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
-+                    } else {
-+                        uint8_t * const stat_coeff =
-+                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+                        const unsigned int c_rice_param = *stat_coeff >> 2;
-+                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
- 
--            // initialize first elem of coeff_bas_level_greater1_flag
--            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-+                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
-+                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+                    }
-+                }
- 
--            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
--                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
--                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
--                else
--                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
--                c_rice_param = lc->stat_coeff[sb_type] / 4;
--            }
-+                {
-+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
-+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
-+                    const unsigned int scale_m = blk_scale[xy_off->scale];
- 
--            if (!(i == num_last_subset) && greater1_ctx == 0)
--                ctx_set++;
--            greater1_ctx = 1;
--            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
--
--            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
--                int inc = (ctx_set << 2) + greater1_ctx;
--                coeff_abs_level_greater1_flag[m] =
--                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
--                if (coeff_abs_level_greater1_flag[m]) {
--                    greater1_ctx = 0;
--                    if (first_greater1_coeff_idx == -1)
--                        first_greater1_coeff_idx = m;
--                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
--                    greater1_ctx++;
-+                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
-+                        (trans_coeff_level ^ k) - k,  // Apply sign
-+                        scale,
-+                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
-+                        shift);
-                 }
-             }
--            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
--
--            if (lc->cu.cu_transquant_bypass_flag ||
--                (lc->cu.pred_mode ==  MODE_INTRA  &&
--                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
--                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
--                 explicit_rdpcm_flag)
--                sign_hidden = 0;
-             else
--                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
-+#endif
-+            {
-+                int sign_hidden = may_hide_sign;
-+                int levels[16]; // Should be able to get away with int16_t but that fails some tests
-+                uint32_t coeff_sign_flags;
-+                uint32_t coded_vals = 0;
-+                // Sum(abs(level[]))
-+                // In fact we only need the bottom bit and in some future
-+                // version that may be all we calculate
-+                unsigned int sum_abs;
-+
-+                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
-+                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
-+
-+                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
-+                    sign_hidden = 0;
-+
-+                // -- Start bypass block
-+
-+                bypass_start(s);
-+
-+                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
-+
-+                if (coded_vals != 0)
-+                {
-+                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
-+                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
-+                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
-+                    int * level = levels - 1;
-+
-+                    do {
-+                        {
-+                            const unsigned int z = hevc_clz32(coded_vals) + 1;
-+                            level += z;
-+                            coded_vals <<= z;
-+                        }
- 
--            if (first_greater1_coeff_idx != -1) {
--                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
--            }
--            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
--                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
--            } else {
--                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
--            }
-+                        {
-+                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
-+                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
-+
-+                            sum_abs += last_coeff_abs_level_remaining + 1;
-+                            *level = trans_coeff_level;
- 
--            for (m = 0; m < n_end; m++) {
--                n = significant_coeff_flag_idx[m];
--                GET_COORD(offset, n);
--                if (m < 8) {
--                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
--                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
--                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
--
--                        trans_coeff_level += last_coeff_abs_level_remaining;
--                        if (trans_coeff_level > (3 << c_rice_param))
--                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
--                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
--                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
--                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
--                                lc->stat_coeff[sb_type]++;
--                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
--                                if (lc->stat_coeff[sb_type] > 0)
--                                    lc->stat_coeff[sb_type]--;
--                            rice_init = 1;
-+                            if (stat_coeff != NULL)
-+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+                            stat_coeff = NULL;
-+
-+                            if (trans_coeff_level > (3 << c_rice_param) &&
-+                                (c_rice_param < 4 || rice_adaptation_enabled))
-+                                ++c_rice_param;
-                         }
--                    }
--                } else {
--                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
--
--                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
--                    if (trans_coeff_level > (3 << c_rice_param))
--                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
--                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
--                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
--                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
--                            lc->stat_coeff[sb_type]++;
--                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
--                            if (lc->stat_coeff[sb_type] > 0)
--                                lc->stat_coeff[sb_type]--;
--                        rice_init = 1;
--                    }
-+                    } while (coded_vals != 0);
-                 }
--                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
--                    sum_abs += trans_coeff_level;
--                    if (n == first_nz_pos_in_cg && (sum_abs&1))
--                        trans_coeff_level = -trans_coeff_level;
-+
-+                // sign_hidden = 0 or 1 so we can combine the tests
-+                if ((sign_hidden & sum_abs) != 0) {
-+                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
-                 }
--                if (coeff_sign_flag >> 15)
--                    trans_coeff_level = -trans_coeff_level;
--                coeff_sign_flag <<= 1;
--                if(!lc->cu.cu_transquant_bypass_flag) {
--                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
--                        if(y_c || x_c || log2_trafo_size < 4) {
--                            switch(log2_trafo_size) {
--                                case 3: pos = (y_c << 3) + x_c; break;
--                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
--                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
--                                default: pos = (y_c << 2) + x_c; break;
--                            }
--                            scale_m = scale_matrix[pos];
--                        } else {
--                            scale_m = dc_scale;
--                        }
-+
-+                bypass_finish(s);
-+
-+                // -- Finish bypass block
-+
-+                // Scale loop
-+                {
-+                    int m = nb_significant_coeff_flag - 1;
-+
-+                    // Deal with DC component (if any) first
-+                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
-+                    {
-+                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+                        blk_coeffs[0] = trans_scale_sat(
-+                            (levels[m] ^ k) - k, scale, dc_scale, shift);
-+                        --m;
-                     }
--                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
--                    if(trans_coeff_level < 0) {
--                        if((~trans_coeff_level) & 0xFffffffffff8000)
--                            trans_coeff_level = -32768;
--                    } else {
--                        if(trans_coeff_level & 0xffffffffffff8000)
--                            trans_coeff_level = 32767;
-+
-+#if !USE_N_END_1
-+                    // If N_END_! set then m was at least 1 initially
-+                    if (m >= 0)
-+#endif
-+                    {
-+                        do {
-+                            const xy_off_t * const xy_off = scan_xy_off +
-+                                significant_coeff_flag_idx[m];
-+                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+
-+                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
-+                                (levels[m] ^ k) - k,
-+                                scale,
-+                                blk_scale[xy_off->scale],
-+                                shift);
-+                        } while (--m >= 0);
-                     }
-                 }
--                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-+
-             }
-         }
--    }
-+    } while ((i = next_subset(s, i, c_idx_nz,
-+        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
- 
-     if (lc->cu.cu_transquant_bypass_flag) {
-         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-@@ -1496,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-         }
-     } else {
--        if (transform_skip_flag) {
-+        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
-             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
-                       log2_trafo_size == 2 &&
-                       lc->cu.pred_mode == MODE_INTRA;
--- 
-2.7.4
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1005-0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1005-0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
deleted file mode 100644
index ab7d3e981d..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1005-0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-From 4060f15e2d29e268110032d4366382e370e088d0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 26 Jun 2016 20:09:18 +0100
-Subject: [PATCH] avcodec: add h264_mvc codec id and profiles
-
----
- libavcodec/avcodec.h    | 5 +++++
- libavcodec/codec_desc.c | 7 +++++++
- libavformat/mpegts.c    | 2 +-
- 3 files changed, 13 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index a1ba217..abd2e91 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -410,6 +410,8 @@ enum AVCodecID {
-     AV_CODEC_ID_SHEERVIDEO,
-     AV_CODEC_ID_YLC,
- 
-+    AV_CODEC_ID_H264_MVC,
-+
-     /* various PCM "codecs" */
-     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
-     AV_CODEC_ID_PCM_S16LE = 0x10000,
-@@ -3195,6 +3197,9 @@ typedef struct AVCodecContext {
- #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
- #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
- #define FF_PROFILE_H264_CAVLC_444            44
-+#define FF_PROFILE_H264_MULTIVIEW_HIGH       118
-+#define FF_PROFILE_H264_STEREO_HIGH          128
-+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
- 
- #define FF_PROFILE_VC1_SIMPLE   0
- #define FF_PROFILE_VC1_MAIN     1
-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
-index 9d94b72..535ebf0 100644
---- a/libavcodec/codec_desc.c
-+++ b/libavcodec/codec_desc.c
-@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
-         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
-         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
-     },
-+    {
-+        .id        = AV_CODEC_ID_H264_MVC,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "h264_mvc",
-+        .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
-+        .props     = AV_CODEC_PROP_LOSSY,
-+    },
- 
-     /* various PCM "codecs" */
-     {
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index b31d233..2767306 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
-@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
- #endif
-     { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
-     { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC        },
--    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
-+    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC   },
-     { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
-     { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
-     { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
--- 
-2.7.4
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1006-0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1006-0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
deleted file mode 100644
index 4894bd781b..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1006-0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 23dd20678a05e1764e5d8d30481cb354a51b6c8b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 26 Jun 2016 20:16:03 +0100
-Subject: [PATCH] h264_parser: add support for parsing h264 mvc NALUs
-
----
- libavcodec/allcodecs.c   |  1 +
- libavcodec/h264.h        |  2 ++
- libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++----
- 3 files changed, 33 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
-index 54efaad..02a89c3 100644
---- a/libavcodec/allcodecs.c
-+++ b/libavcodec/allcodecs.c
-@@ -667,6 +667,7 @@ void avcodec_register_all(void)
-     REGISTER_PARSER(H261,               h261);
-     REGISTER_PARSER(H263,               h263);
-     REGISTER_PARSER(H264,               h264);
-+    REGISTER_PARSER(H264_MVC,           h264_mvc);
-     REGISTER_PARSER(HEVC,               hevc);
-     REGISTER_PARSER(MJPEG,              mjpeg);
-     REGISTER_PARSER(MLP,                mlp);
-diff --git a/libavcodec/h264.h b/libavcodec/h264.h
-index efe3555..16358aa 100644
---- a/libavcodec/h264.h
-+++ b/libavcodec/h264.h
-@@ -126,7 +126,9 @@ enum {
-     NAL_END_STREAM      = 11,
-     NAL_FILLER_DATA     = 12,
-     NAL_SPS_EXT         = 13,
-+    NAL_SPS_SUBSET      = 15,
-     NAL_AUXILIARY_SLICE = 19,
-+    NAL_SLICE_EXT       = 20,
-     NAL_FF_IGNORE       = 0xff0f001,
- };
- 
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index ce4bab2..082ac17 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -58,6 +58,7 @@ typedef struct H264ParseContext {
-     uint8_t parse_history[6];
-     int parse_history_count;
-     int parse_last_mb;
-+    int is_mvc;
- } H264ParseContext;
- 
- 
-@@ -105,14 +106,18 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
-         } else if (state <= 5) {
-             int nalu_type = buf[i] & 0x1F;
-             if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
--                nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
-+                nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
-+                nalu_type == NAL_SPS_SUBSET) {
-                 if (pc->frame_start_found) {
-                     i++;
-                     goto found;
-                 }
-             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
--                       nalu_type == NAL_IDR_SLICE) {
-+                       nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
-                 state += 8;
-+
-+                if (nalu_type == NAL_SLICE_EXT)
-+                    i += 3; // skip mvc extension
-                 continue;
-             }
-             state = 7;
-@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
-         }
-     }
- 
--    parse_nal_units(s, avctx, buf, buf_size);
-+    if (!p->is_mvc)
-+        parse_nal_units(s, avctx, buf, buf_size);
- 
-     if (avctx->framerate.num)
-         avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
-@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
-         if ((state & 0xFFFFFF00) != 0x100)
-             break;
-         nalu_type = state & 0x1F;
--        if (nalu_type == NAL_SPS) {
-+        if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
-             has_sps = 1;
-         } else if (nalu_type == NAL_PPS)
-             has_pps = 1;
-@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
-     .parser_close   = h264_close,
-     .split          = h264_split,
- };
-+
-+static av_cold int init_mvc(AVCodecParserContext *s)
-+{
-+    H264ParseContext *p = s->priv_data;
-+    int ret = init(s);
-+    if (ret < 0)
-+        return ret;
-+
-+    p->is_mvc = 1;
-+    return 0;
-+}
-+
-+AVCodecParser ff_h264_mvc_parser = {
-+    .codec_ids      = { AV_CODEC_ID_H264_MVC },
-+    .priv_data_size = sizeof(H264ParseContext),
-+    .parser_init    = init_mvc,
-+    .parser_parse   = h264_parse,
-+    .parser_close   = h264_close,
-+    .split          = h264_split,
-+};
--- 
-2.7.4
-
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1007-h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1007-h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
deleted file mode 100644
index 1272d4889a..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1007-h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
+++ /dev/null
@@ -1,56 +0,0 @@
-From 12d99a92469e5916de3bc787dce4c13abfdd5e09 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 26 Jun 2016 20:20:04 +0100
-Subject: [PATCH] h264_parser: fix parsing of mvc slices in some corner cases
-
----
- libavcodec/h264_parser.c | 10 +++++-----
- 1 file changed, 5 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index 082ac17..b9b0c78 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -59,6 +59,7 @@ typedef struct H264ParseContext {
-     int parse_history_count;
-     int parse_last_mb;
-     int is_mvc;
-+    int slice_ext;
- } H264ParseContext;
- 
- 
-@@ -116,18 +117,17 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
-                        nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
-                 state += 8;
- 
--                if (nalu_type == NAL_SLICE_EXT)
--                    i += 3; // skip mvc extension
-+                p->slice_ext = (nalu_type == NAL_SLICE_EXT);
-                 continue;
-             }
-             state = 7;
-         } else {
-             p->parse_history[p->parse_history_count++] = buf[i];
--            if (p->parse_history_count > 5) {
-+            if (p->parse_history_count > 8) {
-                 unsigned int mb, last_mb = p->parse_last_mb;
-                 GetBitContext gb;
- 
--                init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
-+                init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
-                 p->parse_history_count = 0;
-                 mb= get_ue_golomb_long(&gb);
-                 p->parse_last_mb = mb;
-@@ -150,7 +150,7 @@ found:
-     pc->frame_start_found = 0;
-     if (p->is_avc)
-         return next_avc;
--    return i - (state & 5) - 5 * (state > 7);
-+    return i - (state & 5) - 8 * (state > 7);
- }
- 
- static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
--- 
-2.7.4
-
-
diff --git a/packages/tools/bcm2835-bootloader/package.mk b/packages/tools/bcm2835-bootloader/package.mk
index 2ffb0851fb..99fefb1689 100644
--- a/packages/tools/bcm2835-bootloader/package.mk
+++ b/packages/tools/bcm2835-bootloader/package.mk
@@ -17,7 +17,7 @@
 ################################################################################
 
 PKG_NAME="bcm2835-bootloader"
-PKG_VERSION="475a89a"
+PKG_VERSION="a5d4376"
 PKG_ARCH="arm"
 PKG_LICENSE="nonfree"
 PKG_SITE="http://www.broadcom.com"
diff --git a/projects/RPi/patches/kodi/kodi-001-backport.patch b/projects/RPi/patches/kodi/kodi-001-backport.patch
index 355c3494c9..42910f0a13 100644
--- a/projects/RPi/patches/kodi/kodi-001-backport.patch
+++ b/projects/RPi/patches/kodi/kodi-001-backport.patch
@@ -1,91 +1,7 @@
-From 3a032772cf28a21dcfcd12f8872e211b391fac64 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Oct 2014 00:19:40 +0000
-Subject: [PATCH 01/64] [cec] Add settings for configuring button repeats
-
----
- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
- system/peripherals.xml                              |  4 +++-
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
- 3 files changed, 34 insertions(+), 1 deletion(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index da5580360222805f83da510d7eba0b67a4c67c84..6e0d5ed0fbba1aee3cca9bff3401b366cb77c2b7 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19728,3 +19728,18 @@ msgstr ""
- msgctxt "#39010"
- msgid "Select sort method"
- msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38050"
-+msgid "Remote button press delay before repeating (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38051"
-+msgid "Remote button press repeat rate (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38052"
-+msgid "Remote button press release time (ms)"
-+msgstr ""
-diff --git a/system/peripherals.xml b/system/peripherals.xml
-index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df0621f9e041 100644
---- a/system/peripherals.xml
-+++ b/system/peripherals.xml
-@@ -31,7 +31,9 @@
-     <setting key="device_type" type="int" value="1" configurable="0" />
-     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
-     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
--    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
-+    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
-+    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
-+    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
-   </peripheral>
- 
-   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
-   m_configuration.bActivateSource = config.bActivateSource;
-   bChanged |= SetSetting("activate_source", m_configuration.bActivateSource == 1);
- 
-+#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
-+  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
-+#else
-+  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs);
-+#endif
-+
-+  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
-+  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
-+
-+  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
-+  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
-+
-   m_configuration.bPowerOffOnStandby = config.bPowerOffOnStandby;
- 
-   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
-@@ -1398,6 +1412,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
-   // backwards compatibility. will be removed once the next major release of libCEC is out
-   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
- #endif
-+  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
-+  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
- 
-   if (GetSettingBool("pause_playback_on_deactivate"))
-   {
-
-From 84fde1194d89b02d321ff4049a572bce88947ec9 Mon Sep 17 00:00:00 2001
+From fb711d36229c80705b4a0a36ce2e120c3e1466fd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Apr 2014 17:27:52 +0100
-Subject: [PATCH 02/64] [cec] Don't suspend pi on tv switch off - it can't wake
+Subject: [PATCH 02/67] [cec] Don't suspend pi on tv switch off - it can't wake
  up
 
 ---
@@ -106,10 +22,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325
      <setting key="use_tv_menu_language" type="bool" value="1" label="36018" order="10" />
      <setting key="pause_playback_on_deactivate" type="bool" value="1" label="36033" configurable="0" />
 
-From 46438fbd67528d9ac3ca8bba386a9f7e1e18c777 Mon Sep 17 00:00:00 2001
+From 6b34039dc27c952fc5217ffc1f0e1fac49992bed Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 03/64] [rbp/omxplayer] When opening a stream don't try to
+Subject: [PATCH 03/67] [rbp/omxplayer] When opening a stream don't try to
  update gui so often
 
 ---
@@ -133,10 +49,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f
          dialog->ProcessRenderLoop(false);
          if (allowCancel && dialog->IsCanceled())
 
-From 1b2fcbc6357fa1399576a819398c01833053b35e Mon Sep 17 00:00:00 2001
+From 4757e370bc4f5fc42f7191b893c2d806d6c76bbc Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 04/64] [hifiberry] Hack: force it to be recognised as IEC958
+Subject: [PATCH 04/67] [hifiberry] Hack: force it to be recognised as IEC958
  capable to enable passthrough options
 
 ---
@@ -159,10 +75,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5
          info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
      {
 
-From ea630fb4c3a67d3fb21b927dd18eaa5ba8937fbb Mon Sep 17 00:00:00 2001
+From eec779f1dba335e11b9b30955f047fa432896b2f Mon Sep 17 00:00:00 2001
 From: Ben Avison <bavison@riscosopen.org>
 Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 05/64] Improved file buffering in CArchive
+Subject: [PATCH 05/67] Improved file buffering in CArchive
 
 Even though memcpy is typically inlined by the compiler into byte/word loads
 and stores (at least for release builds), the frequency with which 1, 2 and 4
@@ -222,10 +138,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a
      }
  
 
-From b56b00d9100980eaee66810fac1ab0124292ba75 Mon Sep 17 00:00:00 2001
+From 2ed4fa5cf7935b1e04d2e2aebd0f214047ed358f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 06/64] filesystem: Make support of browsing into archives
+Subject: [PATCH 06/67] filesystem: Make support of browsing into archives
  optional
 
 The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
@@ -244,10 +160,10 @@ We'll let people who don't use archives disable it manually
  4 files changed, 26 insertions(+), 2 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6e0d5ed0fbba1aee3cca9bff3401b366cb77c2b7..6cc42fe19769b23fa71d6bc9ae6776cad01d9e19 100644
+index 6443f3dd885bf0aa8e031039e36e273972a310ae..7dfc5355cc0d85d94360ba21bc738733e4878f3d 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19371,6 +19371,15 @@ msgstr ""
+@@ -19388,6 +19388,15 @@ msgstr ""
  #: system/settings/rbp.xml
  msgctxt "#38010"
  msgid "GPU accelerated"
@@ -335,10 +251,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7
    {
      CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
 
-From a5cb6b253a6c9e6a1b7f4cf8aaf16e804f679856 Mon Sep 17 00:00:00 2001
+From dcebf738cde43680261a85a3385c728189b84cdb Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 07/64] [rbp] Make cachemembuffersize default depend on memory
+Subject: [PATCH 07/67] [rbp] Make cachemembuffersize default depend on memory
  size
 
 ---
@@ -402,7 +318,7 @@ index a35a509a91483f13e2cf0e688fc7e9528f254290..fffa5182126159f6dfcf750b21fa0464
    void Deinitialize();
    int GetArmMem() { return m_arm_mem; }
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index cc37998f0c9edfb38cf609666374cfa96530bf8f..3891a7ed34acb3489a860678d56a8ec049890f6e 100644
+index 1c00edab33101b82a5817ac03c7f1d98007e1856..12ba1aca0ba838bd8d33e9ca1043845c10f90954 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -50,6 +50,9 @@
@@ -440,10 +356,10 @@ index cc37998f0c9edfb38cf609666374cfa96530bf8f..3891a7ed34acb3489a860678d56a8ec0
  }
  
 
-From 2fb7a0e59386ce93c8f4e7685880bd292d179b29 Mon Sep 17 00:00:00 2001
+From b3cfcae349f63dc41713cb9cd24707f02b0184d6 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 08/64] [settings] Experiment: Report DESKTOP resolution in
+Subject: [PATCH 08/67] [settings] Experiment: Report DESKTOP resolution in
  video settings
 
 ---
@@ -465,10 +381,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9
          StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
                              ModeFlagsToString(resolution->flags, false).c_str()),
 
-From d38a1b9896fe67dd2504144f49138393a335eaad Mon Sep 17 00:00:00 2001
+From 4a486ce217ef15870a9c23d6e0cbd2c69137100a Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 09/64] [audio] Add settings option to boost centre channel
+Subject: [PATCH 09/67] [audio] Add settings option to boost centre channel
  when downmixing
 
 This allows a dB volume increase to be added to centre channel.
@@ -486,10 +402,10 @@ Should work with Pi Sink (dvdplayer/paplayer) and omxplayer
  5 files changed, 46 insertions(+)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6cc42fe19769b23fa71d6bc9ae6776cad01d9e19..7b171b3186d47726d1f60cc0225358dc434e9d9f 100644
+index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390ada04bca8 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19591,6 +19591,21 @@ msgstr ""
+@@ -19608,6 +19608,21 @@ msgstr ""
  
  #empty strings from id 38062 to 38099
  
@@ -512,7 +428,7 @@ index 6cc42fe19769b23fa71d6bc9ae6776cad01d9e19..7b171b3186d47726d1f60cc0225358dc
  #: system/settings/settings.xml
  msgctxt "#38100"
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index f28f5daa8145613670b93fdb221dc53eadf5ce63..316b641d01ceaa8e0a347d8331b56b41c6a44b49 100644
+index 301e7276e5b79e00457db1f33b1cd576bdef4c85..5f1f3ca48342ef1a4eeed7432221d7b2dda354e8 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -2358,6 +2358,18 @@
@@ -594,10 +510,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5
      // stereo upmix
      if (upmix && m_src_channels == 2 && m_dst_channels > 2)
 
-From 3d536f5d2226193b5066d97f727795efce625d48 Mon Sep 17 00:00:00 2001
+From d3125a94e433da8ead850dfa45ed1d6ded3f3148 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 10/64] [rbp] Default extract thumbnails to false
+Subject: [PATCH 10/67] [rbp] Default extract thumbnails to false
 
 It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
 It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
@@ -623,10 +539,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc
      </category>
    </section>
 
-From 6adf157187e26042b135d082bc1d91637e4108c4 Mon Sep 17 00:00:00 2001
+From 2e4c872b8c0b795156f96918cb8a2d4e099b1d1e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 11/64] [languageinvoker] Reduce priority of python threads
+Subject: [PATCH 11/67] [languageinvoker] Reduce priority of python threads
 
 ---
  xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
@@ -649,10 +565,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1
  }
  
 
-From 7edf27fd315fa0aa2683790e8d16c7674253f86f Mon Sep 17 00:00:00 2001
+From 97ce5209853a7f18a79e5d98893353a3bb52f3dd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 12/64] [rbp] hack: wait for splash to complete before changing
+Subject: [PATCH 12/67] [rbp] hack: wait for splash to complete before changing
  hdmi mode
 
 ---
@@ -736,10 +652,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928
  
    RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
 
-From cdc528d79b99a1cc90d1828f56a46234fd685c9a Mon Sep 17 00:00:00 2001
+From 70f0ff25bc73321491cc1ad85e3fbb5514dfdc16 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 13/64] Fix for UI not showing both extractflags and
+Subject: [PATCH 13/67] Fix for UI not showing both extractflags and
  extractthumb
 
 ---
@@ -748,7 +664,7 @@ Subject: [PATCH 13/64] Fix for UI not showing both extractflags and
  2 files changed, 9 insertions(+), 5 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569e832641a 100644
+index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
 @@ -12451,7 +12451,7 @@ msgstr ""
@@ -760,7 +676,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #: xbmc/dialogs/GUIDialogSmartPlaylistRule.cpp
-@@ -17011,7 +17011,7 @@ msgstr ""
+@@ -17028,7 +17028,7 @@ msgstr ""
  #. Description of setting with label #20433 "Extract thumbnails and video information"
  #: system/settings/settings.xml
  msgctxt "#36178"
@@ -769,7 +685,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #. Description of setting with label #20419 "Replace file names with library titles"
-@@ -17023,7 +17023,7 @@ msgstr ""
+@@ -17040,7 +17040,7 @@ msgstr ""
  #. Description of setting with label #20433 "Extract thumbnails and video information"
  #: system/settings/settings.xml
  msgctxt "#36180"
@@ -778,7 +694,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #: system/settings/settings.xml
-@@ -19767,3 +19767,7 @@ msgstr ""
+@@ -19784,3 +19784,7 @@ msgstr ""
  msgctxt "#38052"
  msgid "Remote button press release time (ms)"
  msgstr ""
@@ -787,7 +703,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
 +msgid "Extract thumbnails from video files"
 +msgstr ""
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 316b641d01ceaa8e0a347d8331b56b41c6a44b49..85d3b93466236c33940d01a10d0b8761d1eaa2f6 100644
+index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b29942315a 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -974,8 +974,8 @@
@@ -802,10 +718,10 @@ index 316b641d01ceaa8e0a347d8331b56b41c6a44b49..85d3b93466236c33940d01a10d0b8761
            <control type="toggle" />
          </setting>
 
-From 3201c31912acbedfd5f035d4bd65df0fbb73d0e3 Mon Sep 17 00:00:00 2001
+From a1f119e0986ee89641e533cbafae576147e5848d Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 14/64] Disable autoscrolling while on screensaver and while
+Subject: [PATCH 14/67] Disable autoscrolling while on screensaver and while
  opening streams.
 
 ---
@@ -818,10 +734,10 @@ Subject: [PATCH 14/64] Disable autoscrolling while on screensaver and while
  6 files changed, 24 insertions(+), 3 deletions(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index c6ef0c0e08493090b02accd5cbcbbcb7d8530d87..426835c1b34477ef4871c8720879ed5f89e40386 100644
+index b8ff91b427c4fd430675aab3d1d93098c976031f..fdf7b1dc04e31ffe8e1d1b83825343b24c645b02 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -5226,3 +5226,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
+@@ -5229,3 +5229,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
    
    return false;
  }
@@ -836,7 +752,7 @@ index c6ef0c0e08493090b02accd5cbcbbcb7d8530d87..426835c1b34477ef4871c8720879ed5f
 +  return onBlackDimScreenSaver || openingStreams;
 +}
 diff --git a/xbmc/Application.h b/xbmc/Application.h
-index 8d5876e03d7180ca71ed7c06108c1fa7c81ebe64..2d7f2616159406efdd0d8df4384f41ac9a144f5d 100644
+index 2c21d92c942dfcb3e29f26f00cb545f4b16dca0d..d5446b200439833fed02f998d180ce001eb98067 100644
 --- a/xbmc/Application.h
 +++ b/xbmc/Application.h
 @@ -393,6 +393,8 @@ public:
@@ -936,10 +852,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144
        if (m_lastRenderTime)
          m_autoScrollDelayTime += currentTime - m_lastRenderTime;
 
-From 66f8b01e3d210df4f9cfadbdf188a82651804cc3 Mon Sep 17 00:00:00 2001
+From 65ee3a30d6489b53126e6d34b01ed8c29a4920e5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 15/64] [demuxer] Avoid memcpy on every demuxer packet
+Subject: [PATCH 15/67] [demuxer] Avoid memcpy on every demuxer packet
 
 Avoids an unnecessary memcpy on every demuxer packet which for
 high bitrate videos can be significant.
@@ -1039,10 +955,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96
      }
      catch(...) {
 
-From 9d108d1ba19f61e3b60260eaf0b65c7e607e9f55 Mon Sep 17 00:00:00 2001
+From 66365771b22ae63d65bbb6df6f8d77d5a5dab33e Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 16/64] Load OSD dialogs on startup.
+Subject: [PATCH 16/67] Load OSD dialogs on startup.
 
 Fixes skipped frames the first time they're loaded in memory on less powered
 devices, like a Raspberry Pi, when using DVDPlayer.
@@ -1137,10 +1053,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e
  CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
  { }
 
-From 973728dd114827a31c754ab1f128a88707f80a49 Mon Sep 17 00:00:00 2001
+From 2c71f9b477cfd5ecb5cdedb6688502dc8cef8fa8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 17/64] [gui] Also limit GUI updates when in non full-screen
+Subject: [PATCH 17/67] [gui] Also limit GUI updates when in non full-screen
  video mode
 
 ---
@@ -1148,10 +1064,10 @@ Subject: [PATCH 17/64] [gui] Also limit GUI updates when in non full-screen
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d46ea490d 100644
+index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61752519ef 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -2768,7 +2768,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
+@@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
  #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU)
      // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode
      // it makes only sense on architectures with multiple layers
@@ -1160,7 +1076,7 @@ index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d
        fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE);
  #endif
  
-@@ -2781,6 +2781,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
+@@ -2784,6 +2784,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
      {
        if (!m_skipGuiRender)
          g_windowManager.Process(CTimeUtils::GetFrameTime());
@@ -1170,10 +1086,10 @@ index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d
      g_windowManager.FrameMove();
    }
 
-From aeab20d2a11f70c12577db4cda2d307fb65ddeb8 Mon Sep 17 00:00:00 2001
+From e0ee1d794615a8f4378801b55fe4c36e12d7aab5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 18/64] [screensaver] Leave GUI contents available for
+Subject: [PATCH 18/67] [screensaver] Leave GUI contents available for
  screensaver
 
 ---
@@ -1203,10 +1119,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac
  
    // Add window to the history list (we must do this before we activate it,
 
-From fb83e3c356d0c1f70c8fd09170d8ea868c5c4bdd Mon Sep 17 00:00:00 2001
+From fb4c838814069e7178ecfde96cecf43ba76cf722 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 19/64] ffmpeg: Automatic switch to software decode for GMC
+Subject: [PATCH 19/67] ffmpeg: Automatic switch to software decode for GMC
  with more than one warp point
 
 ---
@@ -1434,10 +1350,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4
        else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
          supported = true;
 
-From 75b4698f0a751244d729c2b8d09a489c2e37d365 Mon Sep 17 00:00:00 2001
+From acde728909548c939ae05ff179e461fdffda3e1b Mon Sep 17 00:00:00 2001
 From: Claudio-Sjo <Claudio.Porfiri@gmail.com>
 Date: Mon, 16 Feb 2015 14:51:26 +0100
-Subject: [PATCH 20/64] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
+Subject: [PATCH 20/67] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
  - fixes #15794
 
 ---
@@ -1629,10 +1545,10 @@ index 0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3
    lsn_t m_lsnCurrent; // Position inside the track in logical sector number
    lsn_t m_lsnEnd;   // End of m_iTrack in logical sector number
 
-From 4282fb935f1a3b2deb8d16e3e9cbc73ed327e451 Mon Sep 17 00:00:00 2001
+From 79220cf49a616e2d1f18a6872323dc02521d4440 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 24 Jun 2016 19:38:13 +0100
-Subject: [PATCH 21/64] codecoverlay: Include codec name in overlay
+Subject: [PATCH 21/67] codecoverlay: Include codec name in overlay
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++
@@ -1643,10 +1559,10 @@ Subject: [PATCH 21/64] codecoverlay: Include codec name in overlay
  5 files changed, 17 insertions(+), 5 deletions(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-index f822935ab7fc919128db53f70a6c4eb84d9759bc..9db3a9cc91fd5f9b194d6c1aa66aa02121164c29 100644
+index ec5f91443f99f57a5e250ddc89a0d04278c00c63..1823f2b02a076e0ab33ca2776fefddb2e126c3d1 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-@@ -210,6 +210,10 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
+@@ -208,6 +208,10 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
    std::ostringstream s;
    s << "aq:"     << std::setw(2) << std::min(99,m_messageQueue.GetLevel()) << "%";
    s << ", Kb/s:" << std::fixed << std::setprecision(2) << (double)GetAudioBitrate() / 1024.0;
@@ -1677,10 +1593,10 @@ index 89db27cce079e3e273050f2fa71f941f21b8280b..903f0d83527d9088ff1bf0ba056f357f
    s << ", skip:" << m_renderManager.GetSkippedFrames();
  
 diff --git a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-index 1e5d2b98bbef15b47994c3e4735873a9946b58c7..d43350fa0eefb5960475a02c1327efc24d138e0f 100644
+index 3fa9e11bf58cc1d59773beb1fbeb6fe614535a6c..3006d5445eb1de27e6a5b9a82f564bcde24f3557 100644
 --- a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
 +++ b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-@@ -659,6 +659,10 @@ std::string OMXPlayerAudio::GetPlayerInfo()
+@@ -641,6 +641,10 @@ std::string OMXPlayerAudio::GetPlayerInfo()
    std::ostringstream s;
    s << "aq:"     << std::setw(2) << std::min(99,m_messageQueue.GetLevel() + MathUtils::round_int(100.0/8.0*GetCacheTime())) << "%";
    s << ", Kb/s:" << std::fixed << std::setprecision(2) << (double)GetAudioBitrate() / 1024.0;
@@ -1726,10 +1642,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291
    double                    m_iSubtitleDelay;
    bool                      m_bRenderSubs;
 
-From 9d1a5913a50d500595acbe929eede41eb43d1a01 Mon Sep 17 00:00:00 2001
+From 5813a683f670077d064bba5fe2592c105b4f73b4 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 8 Mar 2016 21:20:58 +0300
-Subject: [PATCH 22/64] [DebugInfo] Add cpu usage info.
+Subject: [PATCH 22/67] [DebugInfo] Add cpu usage info.
 
 ---
  .../VideoPlayer/VideoRenderers/DebugRenderer.cpp   | 56 ++++++++--------------
@@ -1899,10 +1815,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef
  
        m_debugTimer.Set(1000);
 
-From ffe63455a082ff98b5b6916e47116027eda65ed2 Mon Sep 17 00:00:00 2001
+From 37063ea86134466d1a61a6b6e1cf51638cb7088b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 23/64] ffmpeg: Allow neon to be enabled in unified builds
+Subject: [PATCH 23/67] ffmpeg: Allow neon to be enabled in unified builds
 
 ---
  tools/depends/target/ffmpeg/Makefile | 4 ++++
@@ -1925,10 +1841,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153
  ifeq ($(OS), linux)
    ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
 
-From 322fa05ebba56f61ee7a7ed46da04301ca2814d0 Mon Sep 17 00:00:00 2001
+From ba52537598de76e187946e0869fcadda7c7d48be Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 24/64] ffmpeg: Add some upstream HEVC optimisations
+Subject: [PATCH 24/67] ffmpeg: Add some upstream HEVC optimisations
 
 ---
  tools/depends/target/ffmpeg/Makefile               |    6 +-
@@ -5726,10 +5642,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81
 +2.5.0
 +
 
-From 2182021a32ca71f9c71366a1df71af134f6ecbbc Mon Sep 17 00:00:00 2001
+From b7d0926bb1263518cc9c6fda4945b1b24ba0bf63 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 25/64] [ffmpeg] Add GPU acceleration to hevc
+Subject: [PATCH 25/67] [ffmpeg] Add GPU acceleration to hevc
 
 ---
  tools/depends/target/ffmpeg/Makefile               |     4 +-
@@ -43915,10 +43831,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71
 +2.7.4
 +
 
-From e81f7f20e79e8dcedd41aa6cf1ec32ae4a9862c0 Mon Sep 17 00:00:00 2001
+From 1c5e663d7d0ad95e5fc3de874b006531a50d9b47 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 26/64] ffmpeg: Add cabac opimisations for hevc
+Subject: [PATCH 26/67] ffmpeg: Add cabac opimisations for hevc
 
 ---
  .../0001-Squashed-commit-of-the-following.patch    | 2179 ++++++++++++++++++++
@@ -46163,10 +46079,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b
  
  make -j ${BUILDTHREADS} 
 
-From 9ada0f3d707391c0f9c684ef5bf3deab52926fbc Mon Sep 17 00:00:00 2001
+From cea464637de727807464b87e3efa161268b891ad Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 27/64] [3d] Make MVC a valid 3D filename tag
+Subject: [PATCH 27/67] [3d] Make MVC a valid 3D filename tag
 
 ---
  xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
@@ -46195,7 +46111,7 @@ index b34873cba6534086ae243326550385867a03256a..1443acaf0f25df458ae49766e13dd032
  }
  
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 3891a7ed34acb3489a860678d56a8ec049890f6e..974305ff329eb6999c908d5e05d723f93137ae33 100644
+index 12ba1aca0ba838bd8d33e9ca1043845c10f90954..3478719e18e9430224542c3ed825cd036e975434 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -402,6 +402,7 @@ void CAdvancedSettings::Initialize()
@@ -46227,10 +46143,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef
      bool m_useDisplayControlHWStereo;
  
 
-From 714732f56e37ffa93ed95405799cadf96b550282 Mon Sep 17 00:00:00 2001
+From bb14b6e47f39bf9e9c659175e397d4b54057b904 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 5 Oct 2015 14:58:05 +0100
-Subject: [PATCH 28/64] [3d] Swap top/bottom sides of GUI
+Subject: [PATCH 28/67] [3d] Swap top/bottom sides of GUI
 
 ---
  xbmc/guilib/GraphicContext.cpp | 2 +-
@@ -46250,10 +46166,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d
    }
    if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
 
-From a9695b01e43e3c9f2148f7bc138f45b07dca89f0 Mon Sep 17 00:00:00 2001
+From 6ea3e583f9bf50d3e4d9fba65442d965db2e7c60 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 11 Oct 2015 20:51:37 +0100
-Subject: [PATCH 29/64] Revert "Revert "Disable extra logging by default""
+Subject: [PATCH 29/67] Revert "Revert "Disable extra logging by default""
 
 This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
 ---
@@ -46261,10 +46177,10 @@ This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 85d3b93466236c33940d01a10d0b8761d1eaa2f6..8b7e9698510c611909d56caa5902391627a084b8 100644
+index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2fdbe11b2 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
-@@ -2822,12 +2822,12 @@
+@@ -2834,12 +2834,12 @@
          </setting>
          <setting id="debug.extralogging" type="boolean" label="666" help="36394">
            <level>1</level>
@@ -46280,10 +46196,10 @@ index 85d3b93466236c33940d01a10d0b8761d1eaa2f6..8b7e9698510c611909d56caa59023916
              <options>loggingcomponents</options>
              <delimiter>,</delimiter>
 
-From a1b74504cf236599e0a4cd1a7e70472d98b8e5fe Mon Sep 17 00:00:00 2001
+From e4a199961a0f9eef3de3b6d9b2f43746aae44e2a Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 30/64] [omximage] Fall back to arm jpeg encode/decode when gpu
+Subject: [PATCH 30/67] [omximage] Fall back to arm jpeg encode/decode when gpu
  is busy
 
 ---
@@ -46526,10 +46442,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218
  
  extern COMXImage g_OMXImage;
 
-From 2d5f106241a3360cc6dafc9bfa8815bfb1811073 Mon Sep 17 00:00:00 2001
+From 4d10ca2fe22af671bc3ee041242aa19fcc6d986d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 31/64] [mmalcodec] Fail to open when width is invalid. Can
+Subject: [PATCH 31/67] [mmalcodec] Fail to open when width is invalid. Can
  happen with mpegts files
 
 ---
@@ -46551,10 +46467,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568
    if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
      return false;
 
-From 779210ec56e1c9910f7ed75f3d5ca793468f1dfe Mon Sep 17 00:00:00 2001
+From 1c412e8a9559575ce37830782c994eeaf608aace Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 32/64] [videoplayer/rbp] Add pi specific option to maintain
+Subject: [PATCH 32/67] [videoplayer/rbp] Add pi specific option to maintain
  vsync with pll adjustment
 
 New A/V sync option in settings/video/playback to do "Adjust PLL".
@@ -46576,10 +46492,10 @@ or drop/dupe audio packets which is normally required.
  12 files changed, 143 insertions(+), 21 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 7c619f5619ea974eda22315179a20569e832641a..6a637a80c9e5d900e23cfd87ee6ce5375d2065d6 100644
+index b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7..55ec0a9985a8e77873d787e879d73c076e13b2c6 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19771,3 +19771,35 @@ msgstr ""
+@@ -19788,3 +19788,35 @@ msgstr ""
  msgctxt "#38190"
  msgid "Extract thumbnails from video files"
  msgstr ""
@@ -46881,10 +46797,10 @@ index 81882a1a3828e3f95df26c1bd88c061d3b994b44..ed6974b1155a7272f3ef5bfed3f74967
    void Drain();
    void AbortAddPackets();
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df58a23abf 100644
+index 1823f2b02a076e0ab33ca2776fefddb2e126c3d1..af38453a0e9b212634ee8a4b99c336fff0a71efc 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-@@ -96,6 +96,7 @@ bool CVideoPlayerAudio::OpenStream(CDVDStreamInfo &hints)
+@@ -95,6 +95,7 @@ bool CVideoPlayerAudio::OpenStream(CDVDStreamInfo &hints)
    bool allowpassthrough = !CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK);
    if (hints.realtime)
      allowpassthrough = false;
@@ -46892,7 +46808,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
    CDVDAudioCodec* codec = CDVDFactoryCodec::CreateAudioCodec(hints, m_processInfo, allowpassthrough, m_processInfo.AllowDTSHDDecode());
    if(!codec)
    {
-@@ -217,8 +218,12 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
+@@ -215,8 +216,12 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
  
    //print the inverse of the resample ratio, since that makes more sense
    //if the resample ratio is 0.5, then we're playing twice as fast
@@ -46905,7 +46821,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
  
    s << ", att:" << std::fixed << std::setprecision(1) << log(GetCurrentAttenuation()) * 20.0f << " dB";
  
-@@ -541,10 +546,12 @@ void CVideoPlayerAudio::SetSyncType(bool passthrough)
+@@ -525,10 +530,12 @@ void CVideoPlayerAudio::SetSyncType(bool passthrough)
      int synctype = (m_synctype >= 0 && m_synctype <= 1) ? m_synctype : 2;
      CLog::Log(LOGDEBUG, "CVideoPlayerAudio:: synctype set to %i: %s", m_synctype, synctypes[synctype]);
      m_prevsynctype = m_synctype;
@@ -46920,7 +46836,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
    }
  }
  
-@@ -602,6 +609,7 @@ bool CVideoPlayerAudio::SwitchCodecIfNeeded()
+@@ -586,6 +593,7 @@ bool CVideoPlayerAudio::SwitchCodecIfNeeded()
    bool allowpassthrough = !CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK);
    if (m_streaminfo.realtime)
      allowpassthrough = false;
@@ -46995,10 +46911,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50
  
  extern CRBP g_RBP;
 
-From f7ed055669b0f9a04bf7ddcaeb579506006dd45e Mon Sep 17 00:00:00 2001
+From 8fcc6e14f70281abfe8f29d4eecb09d3a1981750 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 33/64] rbp: Support zero copy interface with hevc acceleration
+Subject: [PATCH 33/67] rbp: Support zero copy interface with hevc acceleration
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++
@@ -47042,10 +46958,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803
    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
      CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem);
 
-From 92cc9e7d80c9bf2eb519eb683b0627554aae9bcf Mon Sep 17 00:00:00 2001
+From d13334643b6bdf8df9c7711fc2942498092f6a88 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 16 May 2015 18:26:04 +0100
-Subject: [PATCH 34/64] ffmpeg: use upstream mvc patches
+Subject: [PATCH 34/67] ffmpeg: use upstream mvc patches
 
 ---
  ...vcodec-add-h264_mvc-codec-id-and-profiles.patch |  68 ++++++++++++
@@ -47355,10 +47271,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b
 +2.7.4
 +
 
-From 6a3b03ea91643c4be1edd7c5bb4308fe193e6f8e Mon Sep 17 00:00:00 2001
+From a31ff8c43a33afc551e852ac68dcce470c418c67 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 29 Jan 2016 17:18:50 +0300
-Subject: [PATCH 35/64] [win32] Settings: Added setting to enable/disable MVC
+Subject: [PATCH 35/67] [win32] Settings: Added setting to enable/disable MVC
  decoder.
 
 ---
@@ -47388,10 +47304,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9
      <category id="display">
        <group id="1">
 
-From d0c53b95139c9170448cbfaebc6cb8da3adfe62c Mon Sep 17 00:00:00 2001
+From 8accdc748ba93a0ff8406a12da77b3456887eea9 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Wed, 20 Jan 2016 17:02:16 +0300
-Subject: [PATCH 36/64] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
+Subject: [PATCH 36/67] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
  streams.
 
 ---
@@ -47454,10 +47370,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81
        }
      case AVMEDIA_TYPE_DATA:
 
-From 546338b3e9e48142fa89cb54ae5864f4e0d2f7d0 Mon Sep 17 00:00:00 2001
+From f24fa3672ebcb63848be6e7b3669f7034616287c Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 25 Feb 2016 11:21:25 +0300
-Subject: [PATCH 37/64] [Stereo3D] Added block_lr and block_rl to supported
+Subject: [PATCH 37/67] [Stereo3D] Added block_lr and block_rl to supported
  modes.
 
 ---
@@ -47507,10 +47423,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814
      i++;
    }
 
-From 0ac0cad635d5ba36264d7674242e95342969f4f6 Mon Sep 17 00:00:00 2001
+From b61e26e406192689178f2059d91ce73d10cf0a5c Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Sat, 23 Jan 2016 10:21:32 +0300
-Subject: [PATCH 38/64] [VideoPlayer] Fix possible wrong aspect.
+Subject: [PATCH 38/67] [VideoPlayer] Fix possible wrong aspect.
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +-
@@ -47530,10 +47446,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd
    else
      m_fForcedAspectRatio = 0.0;
 
-From 49681796854013d3028d92d6cee16bb361c31b0d Mon Sep 17 00:00:00 2001
+From 88e75af3d50f9e4c2912317f3f3713372a472fec Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 22 Jan 2016 18:18:33 +0300
-Subject: [PATCH 39/64] [VideoPlayer] DemuxFFmpeg: ssif remux
+Subject: [PATCH 39/67] [VideoPlayer] DemuxFFmpeg: ssif remux
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt  |   2 +
@@ -47954,7 +47870,7 @@ index e4f8aed0af96fe0dceec4d8517087742f2c7df81..30076937bd084936571abf0e6eeecf5a
  LIB = DVDDemuxers.a
  
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 974305ff329eb6999c908d5e05d723f93137ae33..985ecf9722141d78471c00e90da15bfad931462a 100644
+index 3478719e18e9430224542c3ed825cd036e975434..748354c94045ca279801464930e98bd57963de96 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -391,7 +391,7 @@ void CAdvancedSettings::Initialize()
@@ -47967,10 +47883,10 @@ index 974305ff329eb6999c908d5e05d723f93137ae33..985ecf9722141d78471c00e90da15bfa
    m_discStubExtensions = ".disc";
    // internal music extensions
 
-From cfc857d82be72d89051610c0f72432487153c99c Mon Sep 17 00:00:00 2001
+From 91fa36649e96a95270e2296449aaa7aa92a25713 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:02:46 +0300
-Subject: [PATCH 40/64] [3DBD] Added support of 3D-BluRay playback.
+Subject: [PATCH 40/67] [3DBD] Added support of 3D-BluRay playback.
 
 ---
  lib/DllLibbluray.h                                 |   8 +
@@ -48960,10 +48876,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 901ebdbc1d85ea4522d75ca17d57b5ef4ac3cdd4 Mon Sep 17 00:00:00 2001
+From 7c59c8fe5dfd19c92cd01adba7203588fc9a05cf Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 11 Mar 2016 16:58:53 +0300
-Subject: [PATCH 41/64] [VideoPlayer] HasVideo returns true if video stream
+Subject: [PATCH 41/67] [VideoPlayer] HasVideo returns true if video stream
  exists. This don't allow start visualization if audio is opened before video.
 
 ---
@@ -48971,10 +48887,10 @@ Subject: [PATCH 41/64] [VideoPlayer] HasVideo returns true if video stream
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index bd11cd8a76fce261e5b7e0129d3b9181f0ef84c6..b8db03c873df41c2a3daa52867d30f8b14965821 100644
+index defe4e44a4cca76527186abb989dcb847e1431cd..b45b7573636de0cecd86606d942d5e3baf214c91 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-@@ -3117,7 +3117,7 @@ void CVideoPlayer::Pause()
+@@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause()
  
  bool CVideoPlayer::HasVideo() const
  {
@@ -48984,10 +48900,10 @@ index bd11cd8a76fce261e5b7e0129d3b9181f0ef84c6..b8db03c873df41c2a3daa52867d30f8b
  
  bool CVideoPlayer::HasAudio() const
 
-From 6b11efe3833212a6bdbe714ae3ac7a2e114a6791 Mon Sep 17 00:00:00 2001
+From 21b67480972e81a6892a02477468fc4d33d786c4 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 42/64] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 42/67] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -49024,10 +48940,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45
      CDVDOverlayImage* overlay = new CDVDOverlayImage();
  
 
-From 8e6cc386c73cc66084f0bdb7a45e2ee6756db6af Mon Sep 17 00:00:00 2001
+From 7fe6612e145e2868b058726f22dfcd8292092aa5 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 7 Apr 2016 17:28:50 +0300
-Subject: [PATCH 43/64] [VideoPlayer] Disable reading extension stream from
+Subject: [PATCH 43/67] [VideoPlayer] Disable reading extension stream from
  input stream if decoder doesn't support it.
 
 ---
@@ -49257,10 +49173,10 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa
  
  class CDVDAudioCodec;
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index b8db03c873df41c2a3daa52867d30f8b14965821..2ee5cbf243a91763ca89747c6c23f3b71875437c 100644
+index b45b7573636de0cecd86606d942d5e3baf214c91..ccc371d723b386eb76f022f794c7563d70e1dadd 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-@@ -3894,6 +3894,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
+@@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
      if (!player->OpenStream(hint))
        return false;
  
@@ -49284,10 +49200,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1
    // classes
    CDVDOverlayContainer* m_pOverlayContainer;
 
-From a0c8e7ce6f37a8cd9fb0e2ce474c0239f8b2280f Mon Sep 17 00:00:00 2001
+From 29f7bc99e15245a3303f2b70c46b6c3f4d15fd84 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 16 Sep 2016 11:37:48 +0300
-Subject: [PATCH 44/64] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
+Subject: [PATCH 44/67] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
  platform settings to common settings.
 
 ---
@@ -49317,7 +49233,7 @@ index 2572e25753712186f69390965ee1448bff3fadd5..7098edf32dff8c00e192229c3ffb060b
    </section>
    <section id="media">
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 8b7e9698510c611909d56caa5902391627a084b8..e73d85c18ea63453275f2a8f2a0cdd96c4b11e39 100644
+index 850abcd174cc8773319639c7e337f2e2fdbe11b2..0fb9464a598cad05893bff627cbd7ddee7341ca8 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -343,6 +343,12 @@
@@ -49369,10 +49285,10 @@ index 74e8e1fc2da66d3c98a5bab04faa2f6bf16539ff..7dd85f0173bd636f4f5ae6e7fc43b306
      MPLS_PL * mpls = m_dll->bd_get_title_mpls(m_bd);
      if (mpls)
 diff --git a/xbmc/settings/SettingConditions.cpp b/xbmc/settings/SettingConditions.cpp
-index 6b1f2b6d757354d6065c2862b44dfb47184a1dcc..9163ec85bd0feb48a698a025d9870bf40042c675 100644
+index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540cc3b8f88 100644
 --- a/xbmc/settings/SettingConditions.cpp
 +++ b/xbmc/settings/SettingConditions.cpp
-@@ -327,6 +327,9 @@ void CSettingConditions::Initialize()
+@@ -339,6 +339,9 @@ void CSettingConditions::Initialize()
    m_simpleConditions.insert("has_dx");
    m_simpleConditions.insert("hasdxva2");
  #endif
@@ -49383,10 +49299,10 @@ index 6b1f2b6d757354d6065c2862b44dfb47184a1dcc..9163ec85bd0feb48a698a025d9870bf4
    m_simpleConditions.insert("have_lcms2");
  #endif
 
-From 618138892406de177ea6ffeb1c8170013f38dc57 Mon Sep 17 00:00:00 2001
+From 6cafd78f07821d41e50d95015e7531b47ff4ba22 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 4 Nov 2016 22:56:56 +0300
-Subject: [PATCH 45/64] [VideoPlayer] SSIF: fix for corner case when mvc stream
+Subject: [PATCH 45/67] [VideoPlayer] SSIF: fix for corner case when mvc stream
  is switched before the last packet is read from previous stream.
 
 ---
@@ -49575,33 +49491,33 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 9414392ca337af025f99cb5ff2388cd18fab05e0 Mon Sep 17 00:00:00 2001
+From a94a694d85b3f7bb7ebd528cbff1c22500dcd033 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:01:08 +0300
-Subject: [PATCH 46/64] [libbluray] bump libbluray to 0.9.2-mvc.
+Subject: [PATCH 46/67] [libbluray] bump libbluray to 0.9.2-mvc.
 
 ---
  project/BuildDependencies/scripts/0_package.list | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list
-index 67151c1a1bf47df2b81d38f80ddc3f5e1a3b4eab..9f7ff84b06acca2a5c24f6a74b82d46c544a8b07 100644
+index 3ca0ecb8b91f4db2a0dae1f0fac217bd18c7bc43..48f6088640142b2d40d9a4bce525baa87d3278a3 100644
 --- a/project/BuildDependencies/scripts/0_package.list
 +++ b/project/BuildDependencies/scripts/0_package.list
-@@ -17,7 +17,7 @@ freetype-2.6.3-win32-vc140.7z
+@@ -17,7 +17,7 @@ freetype-dc2b38-win32-vc140-v2.7z
  giflib-5.1.4-win32-vc140.7z
  jsonschemabuilder-1.0.0-win32-3.7z
- libass-542975a-win32-vc140.7z
+ libass-6aaaf5-win32-vc140.7z
 -libbluray-0.9.3-win32-vc140.7z
 +libbluray-0.9.2-mvc-win32-vc120.7z
  libcdio-0.9.3-win32-vc140.7z
  libcec-4.0.1-win32-vc140-2.7z
  libfribidi-0.19.2-win32.7z
 
-From e707f353eb3a974559e4a139610ae9a23ff48f4d Mon Sep 17 00:00:00 2001
+From 4b4f63b3b9083dc559680a33095ffd21ec6c8d81 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 29 Feb 2016 17:00:50 +0000
-Subject: [PATCH 47/64] libbluray: Bump to Nevcairie's v0.9.2
+Subject: [PATCH 47/67] libbluray: Bump to Nevcairie's v0.9.2
 
 This includes 3D support
 ---
@@ -51258,10 +51174,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220
 + 
 +     return fp;
 
-From c1e2e8a5832154824fb4ed0b43d214bbef8fcbde Mon Sep 17 00:00:00 2001
+From 05f619cf0bde4209267f1de6270e7fc93718b0d7 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 6 Mar 2016 12:54:59 +0000
-Subject: [PATCH 48/64] mvc: Automatically enable stereo mode
+Subject: [PATCH 48/67] mvc: Automatically enable stereo mode
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++-
@@ -51319,10 +51235,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd
      break;
      case AV_CODEC_ID_MPEG4:
 
-From 965d2478fb479e9b1624c8cba5a2d4a53a6187e4 Mon Sep 17 00:00:00 2001
+From 0792d059c421811de7a52b55638373c26c21e008 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 24 Mar 2016 13:02:58 +0000
-Subject: [PATCH 49/64] ffmpeg: mvc: fix for pixelation from packets with no
+Subject: [PATCH 49/67] ffmpeg: mvc: fix for pixelation from packets with no
  pts/dts
 
 ---
@@ -51384,10 +51300,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba
  	cd $(PLATFORM);\
  	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
 
-From 8092e05383265a3d5f9b86d63de81fba443dd719 Mon Sep 17 00:00:00 2001
+From 7cca3aef831e26c2e43a92646a0243b58486981d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 11 Nov 2016 15:53:53 +0000
-Subject: [PATCH 50/64] stereoscopicmanager: fixups for rbp
+Subject: [PATCH 50/67] stereoscopicmanager: fixups for rbp
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++
@@ -51625,10 +51541,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782
  };
  
 
-From 2df4c943373762d16be809dd00f188af2b6dc631 Mon Sep 17 00:00:00 2001
+From 780c207fcd4e086d32f74d6a097642ba86b8b8a3 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 51/64] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 51/67] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -51648,10 +51564,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18
    }
  
 
-From 6502330a06981420a0e94795c9470ea0eced81e3 Mon Sep 17 00:00:00 2001
+From be1121b5490ba594b4338223d0587c4ce12b6030 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Nov 2016 18:24:18 +0000
-Subject: [PATCH 52/64] DemuxMVC: fixup after SeekTime API change
+Subject: [PATCH 52/67] DemuxMVC: fixup after SeekTime API change
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +-
@@ -51685,36 +51601,10 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425
    virtual int GetStreamLength() { return 0; };
    virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; };
 
-From 71bd79c370a44ab2f0c01faca4fbe0d2557bc553 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 3 Nov 2014 23:17:46 +0000
-Subject: [PATCH 53/64] [cec] Don't discard buttons when repeat mode is enabled
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
- 
-   CSingleLock lock(m_critSection);
--  if (key.iDuration > 0)
-+  // avoid the queue getting too long
-+  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+    return;
-+  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-     {
-
-From bcd2d7a264e2822f9dfd0283254a4a1cf233b7c6 Mon Sep 17 00:00:00 2001
+From 76a51144a06e9b8147407ccb4d6fe926e8a7816b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 54/64] [cec] Temp - more logging
+Subject: [PATCH 54/67] [cec] Temp - more logging
 
 ---
  xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
@@ -51766,10 +51656,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a
  }
  
 
-From 938124103d0041c0d275629d691fc6d39739b840 Mon Sep 17 00:00:00 2001
+From 205a27d0a81ab333c53a786a00dffae7e1d6205f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 25 May 2016 18:31:17 +0100
-Subject: [PATCH 55/64] rbp: Hard code the number of buffers to improve audio
+Subject: [PATCH 55/67] rbp: Hard code the number of buffers to improve audio
  sync
 
 ---
@@ -51811,10 +51701,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d
  
  }
 
-From 9688862943ab68f8aca7c8d58bf75172bdc7128e Mon Sep 17 00:00:00 2001
+From ac68b8f32842e24009e2de53e42fa6775af263a8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 4 Jul 2016 18:30:03 +0100
-Subject: [PATCH 56/64] rbp: Update the GL libs to new naming scheme
+Subject: [PATCH 56/67] rbp: Update the GL libs to new naming scheme
 
 As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues.
 As such we are renaming the firmware GL driver to avoid this.
@@ -51828,7 +51718,7 @@ will be dropped at some point
  3 files changed, 5 insertions(+), 5 deletions(-)
 
 diff --git a/configure.ac b/configure.ac
-index 060939073c841360dd69bfd9c3a50bd15b6a9411..065af598a8e06b80a779ece30d1d09440b1293bf 100644
+index a37c8c5d2d92194731203b19a5cf8a369e96d3fa..772facc7c837e730317b8708800741efc608a9c9 100644
 --- a/configure.ac
 +++ b/configure.ac
 @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then
@@ -51879,10 +51769,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71
  fi
  
 
-From 34a539610c33155e45768df7e5d2d9c9ee6258d7 Mon Sep 17 00:00:00 2001
+From 9838684c2520a11221010731a75d1ee556216205 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 28 Jun 2016 14:46:01 +0100
-Subject: [PATCH 57/64] ffmpeg: hacky fix for files with GMC
+Subject: [PATCH 57/67] ffmpeg: hacky fix for files with GMC
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++--
@@ -51904,10 +51794,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675
          {
            if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
 
-From 0f4661f6302b5c8025ba24d55522faa6513f2dab Mon Sep 17 00:00:00 2001
+From d31579fe437b86fb71198d5f077645cfaae0f017 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 19 Jul 2016 20:39:18 +0100
-Subject: [PATCH 58/64] mmalrender: Add sharpness control
+Subject: [PATCH 58/67] mmalrender: Add sharpness control
 
 ---
  addons/resource.language.en_gb/resources/strings.po         |  2 +-
@@ -51916,7 +51806,7 @@ Subject: [PATCH 58/64] mmalrender: Add sharpness control
  3 files changed, 14 insertions(+), 2 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6a637a80c9e5d900e23cfd87ee6ce5375d2065d6..4ae21a644a4739448e7752c95970fc61b1d3ebd3 100644
+index 55ec0a9985a8e77873d787e879d73c076e13b2c6..eea89feb0f698619623ec67ed0078d30d18c22fc 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
 @@ -8694,7 +8694,7 @@ msgstr ""
@@ -51979,10 +51869,10 @@ index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd
    CCriticalSection m_sharedSection;
    MMAL_COMPONENT_T *m_vout;
 
-From 41a329e62bc59dcb395c060c02551e5bba7c5691 Mon Sep 17 00:00:00 2001
+From bf759c6dad91e60580981c29146bf495a18167cf Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 14 Oct 2016 15:37:53 +0100
-Subject: [PATCH 59/64] MMALFFMpeg: Report as SW decode in codec overlay info
+Subject: [PATCH 59/67] MMALFFMpeg: Report as SW decode in codec overlay info
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +-
@@ -52002,10 +51892,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0
    CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str());
  }
 
-From a92c013a38a1b086d74c6267c587484297128626 Mon Sep 17 00:00:00 2001
+From 8c3b6d8a7d39d535cc73cd5d0479fb2a01fd1171 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Nov 2016 18:28:01 +0000
-Subject: [PATCH 60/64] advancedsettings: Add option to set cache size on
+Subject: [PATCH 60/67] advancedsettings: Add option to set cache size on
  libass
 
 E.g to set total cache size in libass to 32M
@@ -52071,7 +51961,7 @@ index f9de4f15e7c612d69ef46e7cad870ecb61afaec3..b5303fd100f1a930eb5c010a95193206
    END_METHOD_RESOLVE()
  };
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 985ecf9722141d78471c00e90da15bfad931462a..a33581ba02a26110105a2d0ae810d96c410efbf1 100644
+index 748354c94045ca279801464930e98bd57963de96..5bdd6244e28c8320e18fed5148d332da19801221 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -364,6 +364,8 @@ void CAdvancedSettings::Initialize()
@@ -52107,10 +51997,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97
      unsigned int m_jsonTcpPort;
  
 
-From 07d4465716adca7dcad36fa18c74fa26f7afabe5 Mon Sep 17 00:00:00 2001
+From 4ce0c7911dba73b04ae32cb813b9a5fca1d44998 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 13 Nov 2016 20:30:15 +0000
-Subject: [PATCH 61/64] [rbp] Experimental limit libass cache size depending on
+Subject: [PATCH 61/67] [rbp] Experimental limit libass cache size depending on
  arm memory size
 
 ---
@@ -52141,7 +52031,7 @@ index 6e8529001b1a464b4547a846f553d98f5bc0b6c0..238eba372af2cbab11d7543c857ee476
    response[sizeof(response) - 1] = '\0';
    CLog::Log(LOGNOTICE, "Config:\n%s", response);
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index a33581ba02a26110105a2d0ae810d96c410efbf1..d70e2cf3113bbe0dad60dfc7accc8d77f7f30c30 100644
+index 5bdd6244e28c8320e18fed5148d332da19801221..95a9d347049dbfa04d74248dce9167a6896566dc 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -361,8 +361,10 @@ void CAdvancedSettings::Initialize()
@@ -52156,33 +52046,10 @@ index a33581ba02a26110105a2d0ae810d96c410efbf1..d70e2cf3113bbe0dad60dfc7accc8d77
    m_libAssCache = 0;
  
 
-From d82023dcbc5281e42f9d3795d94c8a6d2da8ddae Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 17 Jan 2017 21:05:26 +0000
-Subject: [PATCH 62/64] ADSP: Hack - disable
-
----
- xbmc/ServiceManager.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/ServiceManager.cpp b/xbmc/ServiceManager.cpp
-index ecaa4037e53fbefcbbd6f7e6b75d1cb781a82cc0..27e50d337c702371817582c61b22892c43c3683a 100644
---- a/xbmc/ServiceManager.cpp
-+++ b/xbmc/ServiceManager.cpp
-@@ -70,7 +70,7 @@ bool CServiceManager::Init2()
- 
- bool CServiceManager::Init3()
- {
--  m_ADSPManager->Init();
-+  //m_ADSPManager->Init();
-   m_PVRManager->Init();
-   m_contextMenuManager->Init();
- 
-
-From b7ba577164b7f01786e2424e34fee7747e373f70 Mon Sep 17 00:00:00 2001
+From 8f293b32905920446d7d84c031a01ef451e8fa4c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 63/64] [rbp] Use default resampling setting on Pi2
+Subject: [PATCH 62/67] [rbp] Use default resampling setting on Pi2
 
 ---
  system/settings/rbp2.xml | 5 +++++
@@ -52205,10 +52072,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
    </section>
  </settings>
 
-From 5e72bc627bb8417a4b5fe19628370ddbd9dcad89 Mon Sep 17 00:00:00 2001
+From d96d809cdb68c8299b0b6771a11eb390060df5b8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 1 Dec 2016 17:06:01 +0000
-Subject: [PATCH 64/64] MMALRender: Allow advanced deinterlace with software
+Subject: [PATCH 63/67] MMALRender: Allow advanced deinterlace with software
  decode
 
 Uses YUV420 directly which improves performance.
@@ -52230,3 +52097,53500 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e
  
      status = mmal_port_format_commit(m_deint_output);
      if (status != MMAL_SUCCESS)
+
+From 38b097a6d6653267c52de2a2d5f6a09c39642684 Mon Sep 17 00:00:00 2001
+From: Nuno Senica <nsenica@gmail.com>
+Date: Tue, 27 Dec 2016 20:59:56 +0000
+Subject: [PATCH 64/67] Apply ffmpeg patches automatically after downloading
+ and extracting the ffmpeg tar ball
+
+---
+ project/cmake/modules/FindFFMPEG.cmake | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake
+index 3d7fcc8ca30224fc589c720e37102588f3739448..486842e9bd6c6e7c65ca0dd9c2a6fc6f26169a5d 100644
+--- a/project/cmake/modules/FindFFMPEG.cmake
++++ b/project/cmake/modules/FindFFMPEG.cmake
+@@ -260,7 +260,17 @@ if(NOT FFMPEG_FOUND)
+                                     <SOURCE_DIR> &&
+                                     ${CMAKE_COMMAND} -E copy
+                                     ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake
+-                                    <SOURCE_DIR>)
++                                    <SOURCE_DIR> &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
++                     )
+ 
+   file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper
+ "#!/bin/bash
+
+From 41d5ae2774e4c5ed6386180c1553f129a1be549c Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Sun, 1 May 2016 19:56:43 +0100
+Subject: [PATCH 65/67] omxplayer: Avoid CAEFactory::Suspend which should only
+ be called by application
+
+---
+ xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp | 12 +++++++++---
+ xbmc/cores/omxplayer/OMXAudio.cpp         | 15 +++++++++++----
+ xbmc/cores/omxplayer/OMXAudio.h           |  2 ++
+ 3 files changed, 22 insertions(+), 7 deletions(-)
+
+diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
+index 750ea754924d00dbaae9f479485d03f4b3011028..adb74a8bffedc118d4734f59162d0fb9598cc139 100644
+--- a/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
++++ b/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
+@@ -214,8 +214,6 @@ bool CAESinkPi::Initialize(AEAudioFormat &format, std::string &device)
+   format.m_sampleRate    = std::max(8000U, std::min(192000U, format.m_sampleRate));
+   format.m_frames        = format.m_sampleRate * AUDIO_PLAYBUFFER / NUM_OMX_BUFFERS;
+ 
+-  SetAudioProps(m_passthrough, GetChannelMap(format.m_channelLayout, m_passthrough));
+-
+   m_format = format;
+   m_sinkbuffer_sec_per_byte = 1.0 / (double)(m_format.m_frameSize * m_format.m_sampleRate);
+ 
+@@ -223,6 +221,12 @@ bool CAESinkPi::Initialize(AEAudioFormat &format, std::string &device)
+                 m_format.m_dataFormat, channels, m_format.m_sampleRate, m_format.m_frameSize, m_format.m_frameSize * m_format.m_frames, 1.0/m_sinkbuffer_sec_per_byte,
+                 CSettings::GetInstance().GetString(CSettings::SETTING_AUDIOOUTPUT_AUDIODEVICE).c_str());
+ 
++  // magic value used when omxplayer is playing - want sink to be disabled
++  if (m_passthrough && m_format.m_streamInfo.m_sampleRate == 16000)
++    return true;
++
++  SetAudioProps(m_passthrough, GetChannelMap(m_format.m_channelLayout, m_passthrough));
++
+   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
+ 
+   if (!m_omx_render.Initialize("OMX.broadcom.audio_render", OMX_IndexParamAudioInit))
+@@ -432,8 +436,10 @@ double CAESinkPi::GetCacheTotal()
+ unsigned int CAESinkPi::AddPackets(uint8_t **data, unsigned int frames, unsigned int offset)
+ {
+   if (!m_Initialized || !m_omx_output || !frames)
++  {
++    Sleep(10);
+     return frames;
+-
++  }
+   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
+   OMX_BUFFERHEADERTYPE *omx_buffer = NULL;
+ 
+diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
+index 993d4b33a294e88c2c004b7943895ba55558c2d0..21764045fbde39bffe58b61f32ad422231d617d2 100644
+--- a/xbmc/cores/omxplayer/OMXAudio.cpp
++++ b/xbmc/cores/omxplayer/OMXAudio.cpp
+@@ -95,16 +95,23 @@ COMXAudio::COMXAudio() :
+   m_failed_eos      (false  ),
+   m_output          (AESINKPI_UNKNOWN)
+ {
+-  CAEFactory::Suspend();
+-  while (!CAEFactory::IsSuspended())
+-    Sleep(10);
++  // magic value used when omxplayer is playing - want sink to be disabled
++  AEAudioFormat m_format;
++  m_format.m_dataFormat = AE_FMT_RAW;
++  m_format.m_streamInfo.m_type = CAEStreamInfo::STREAM_TYPE_AC3;
++  m_format.m_streamInfo.m_sampleRate = 16000;
++  m_format.m_streamInfo.m_channels = 2;
++  m_format.m_sampleRate = 16000;
++  m_format.m_frameSize = 1;
++  m_pAudioStream = CAEFactory::MakeStream(m_format, 0, nullptr);
+ }
+ 
+ COMXAudio::~COMXAudio()
+ {
+   Deinitialize();
+ 
+-  CAEFactory::Resume();
++  if (m_pAudioStream)
++    CAEFactory::FreeStream(m_pAudioStream);
+ }
+ 
+ bool COMXAudio::PortSettingsChanged()
+diff --git a/xbmc/cores/omxplayer/OMXAudio.h b/xbmc/cores/omxplayer/OMXAudio.h
+index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9d8fe8eab 100644
+--- a/xbmc/cores/omxplayer/OMXAudio.h
++++ b/xbmc/cores/omxplayer/OMXAudio.h
+@@ -24,6 +24,7 @@
+ 
+ #include "cores/AudioEngine/Utils/AEAudioFormat.h"
+ #include "cores/AudioEngine/Utils/AEUtil.h"
++#include "cores/AudioEngine/Interfaces/AEStream.h"
+ #include "linux/PlatformDefs.h"
+ #include "DVDStreamInfo.h"
+ 
+@@ -145,6 +146,7 @@ private:
+   OMX_AUDIO_PARAM_PCMMODETYPE m_pcm_input;
+   OMX_AUDIO_PARAM_DTSTYPE     m_dtsParam;
+   WAVEFORMATEXTENSIBLE        m_wave_header;
++  IAEStream *m_pAudioStream;
+ protected:
+   COMXCoreComponent m_omx_render_analog;
+   COMXCoreComponent m_omx_render_hdmi;
+
+From a6419a20e48ee45774ef61d41ae9124228e632ca Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Wed, 1 Mar 2017 21:40:22 +0000
+Subject: [PATCH 66/67] MMALRender: default to bob (x2) deinterlace for HD
+
+There are still issues with some dvb dongles run on the same Pi as playback.
+Default to bob. Users who aren't using these devices will have to manually enable advanced.
+---
+ xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp | 4 ++--
+ xbmc/cores/omxplayer/OMXVideo.cpp                                  | 5 +++++
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
+index 05cbd8eeaef1a21fc32ea1fa23ea686e3cd7e33b..9279966fa634f6f5a3e00f12dd528337392cf038 100644
+--- a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
++++ b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
+@@ -555,8 +555,8 @@ void CMMALRenderer::Run()
+         if (interlace_method == VS_INTERLACEMETHOD_AUTO)
+         {
+           interlace_method = VS_INTERLACEMETHOD_MMAL_ADVANCED;
+-          // avoid advanced deinterlace when using software decode and HD resolution
+-          if (omvb->m_state == MMALStateFFDec && omvb->m_width * omvb->m_height > 720*576)
++          // avoid advanced deinterlace when using HD resolution
++          if (omvb->m_width * omvb->m_height > 720*576)
+             interlace_method = VS_INTERLACEMETHOD_MMAL_BOB;
+         }
+         bool interlace = (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED) ? true:false;
+diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
+index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae9ff2f139 100644
+--- a/xbmc/cores/omxplayer/OMXVideo.cpp
++++ b/xbmc/cores/omxplayer/OMXVideo.cpp
+@@ -236,7 +236,12 @@ bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
+ 
+   EINTERLACEMETHOD interlace_method = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
+   if (interlace_method == VS_INTERLACEMETHOD_AUTO)
++  {
+     interlace_method = VS_INTERLACEMETHOD_MMAL_ADVANCED;
++    // avoid advanced deinterlace when using HD resolution
++    if (port_image.format.video.nFrameWidth * port_image.format.video.nFrameHeight > 720*576)
++      interlace_method = VS_INTERLACEMETHOD_MMAL_BOB;
++  }
+ 
+   if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE)
+   {
+
+From 58b2734d76c3e32caed3ed96da93b3b02391c8d8 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 17 Feb 2017 17:58:13 +0000
+Subject: [PATCH 67/67] ffmpeg: Update hevc optimisation to use the gpu service
+
+---
+ project/cmake/modules/FindFFMPEG.cmake             |    14 +-
+ tools/depends/target/ffmpeg/Makefile               |    14 +-
+ tools/depends/target/ffmpeg/autobuild.sh           |    14 +-
+ .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 52224 ++++++-------------
+ 4 files changed, 16342 insertions(+), 35924 deletions(-)
+
+diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake
+index 486842e9bd6c6e7c65ca0dd9c2a6fc6f26169a5d..92e79eb225640bb8a965ee63b3fd5c743e09758f 100644
+--- a/project/cmake/modules/FindFFMPEG.cmake
++++ b/project/cmake/modules/FindFFMPEG.cmake
+@@ -261,14 +261,14 @@ if(NOT FFMPEG_FOUND)
+                                     ${CMAKE_COMMAND} -E copy
+                                     ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake
+                                     <SOURCE_DIR> &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
+                                     patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
+                                     patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+                      )
+ 
+diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
+index d4f279fd4f2ceb260698cd6fedb124bae61018d0..11e92a9ad618b748cad4831fa6af7565e29081ab 100644
+--- a/tools/depends/target/ffmpeg/Makefile
++++ b/tools/depends/target/ffmpeg/Makefile
+@@ -88,14 +88,14 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
+ 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)
+ 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
+ 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
+-	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+-	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
+-	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
++	#cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
++	#cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
+ 	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
+-	cd $(PLATFORM); patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
++	#cd $(PLATFORM); patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
+ 	cd $(PLATFORM); patch -p1 < ../73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+ 
+ 	cd $(PLATFORM);\
+diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
+index 9f6c26c8acd08ed603aadeb4d9d81b07026e7506..3d970429012c1f3aede4df0545ced5006c165d50 100755
+--- a/tools/depends/target/ffmpeg/autobuild.sh
++++ b/tools/depends/target/ffmpeg/autobuild.sh
+@@ -132,14 +132,14 @@ mkdir -p "ffmpeg-${VERSION}"
+ cd "ffmpeg-${VERSION}" || exit 2
+ tar --strip-components=1 -xf $MYDIR/${ARCHIVE}
+ 
+-patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+-patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
+-patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
++#patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
++#patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
++#patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
+ patch -p1 < ../pfcd_hevc_optimisations.patch
+-patch -p1 < ../0001-Squashed-commit-of-the-following.patch
+-patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
+-patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
+-patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
++#patch -p1 < ../0001-Squashed-commit-of-the-following.patch
++#patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
++#patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
++#patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
+ 
+ CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
+ ./configure --prefix=$FFMPEG_PREFIX \
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e49b18a67 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -1,14 +1,17 @@
+-From b9b5434c61afd492a54dad5158b4d56ecbf7f01d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 28 Apr 2015 16:18:40 +0100
+-Subject: [PATCH 01/68] Added display output
+-
+----
+- ffmpeg.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 159 insertions(+)
+-
++diff --git a/.gitignore b/.gitignore
++index 524fb73..305632b 100644
++--- a/.gitignore
+++++ b/.gitignore
++@@ -23,6 +23,7 @@
++ .\#*
++ /.config
++ /.version
+++/build/
++ /ffmpeg
++ /ffplay
++ /ffprobe
+ diff --git a/ffmpeg.c b/ffmpeg.c
+-index 9ffd833..50c6e86 100644
++index 9ffd833..7a86d7e 100644
+ --- a/ffmpeg.c
+ +++ b/ffmpeg.c
+ @@ -23,6 +23,11 @@
+@@ -17,17 +20,20 @@ index 9ffd833..50c6e86 100644
+  
+ +#ifdef RPI
+ +#define RPI_DISPLAY
+-+//#define RPI_ZERO_COPY
+++#define RPI_ZERO_COPY
+ +#endif
+ +
+  #include "config.h"
+  #include <ctype.h>
+  #include <string.h>
+-@@ -66,6 +71,20 @@
++@@ -66,6 +71,25 @@
+  # include "libavfilter/buffersrc.h"
+  # include "libavfilter/buffersink.h"
+  
+ +#ifdef RPI_DISPLAY
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ +#include <bcm_host.h>
+ +#include <interface/mmal/mmal.h>
+ +#include <interface/mmal/mmal_parameters_camera.h>
+@@ -36,15 +42,17 @@ index 9ffd833..50c6e86 100644
+ +#include <interface/mmal/util/mmal_default_components.h>
+ +#include <interface/mmal/util/mmal_connection.h>
+ +#include <interface/mmal/util/mmal_util_params.h>
+++#pragma GCC diagnostic pop
+ +#ifdef RPI_ZERO_COPY
+ +#include "libavcodec/rpi_qpu.h"
+ +#endif
+++#include "libavcodec/rpi_zc.h"
+ +#endif
+ +
+  #if HAVE_SYS_RESOURCE_H
+  #include <sys/time.h>
+  #include <sys/types.h>
+-@@ -158,6 +177,134 @@ static int restore_tty;
++@@ -158,6 +182,169 @@ static int restore_tty;
+  static void free_input_threads(void);
+  #endif
+  
+@@ -54,13 +62,7 @@ index 9ffd833..50c6e86 100644
+ +
+ +static MMAL_COMPONENT_T* rpi_display = NULL;
+ +static MMAL_POOL_T *rpi_pool = NULL;
+-+
+-+#ifdef RPI_ZERO_COPY
+-+static uint8_t *get_vc_handle(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+  return (uint8_t *)p->vc_handle;
+-+}
+-+#endif
+++static volatile int rpi_display_count = 0;
+ +
+ +static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+ +{
+@@ -77,7 +79,7 @@ index 9ffd833..50c6e86 100644
+ +    for (i = 0; i < NUM_BUFFERS; ++i)
+ +    {
+ +       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+-+       void* bufPtr = buffer->data;
+++       char * bufPtr = buffer->data;
+ +       memset(bufPtr, i*30, w*h);
+ +       memset(bufPtr+w*h, 128, (w*h)/2);
+ +    }
+@@ -86,81 +88,122 @@ index 9ffd833..50c6e86 100644
+ +    return pool;
+ +}
+ +
+-+static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+++#ifdef RPI_ZERO_COPY
+++    av_rpi_zc_unref(buffer->user_data);
+++    --rpi_display_count;
+++#endif
+++    mmal_buffer_header_release(buffer);
+++}
+++
+++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ +  mmal_buffer_header_release(buffer);
+ +}
+ +
+ +static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+ +{
+ +    MMAL_COMPONENT_T* display;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+ +    MMAL_DISPLAYREGION_T region =
+ +    {
+-+        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+++        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ +        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ +        .layer = 2,
+ +        .fullscreen = 0,
+ +        .dest_rect = {x, y, w, h}
+ +    };
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+++
+ +    bcm_host_init();  // TODO is this needed?
+ +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+ +    assert(display);
+ +
+ +    mmal_port_parameter_set(display->input[0], &region.hdr);
+ +
+-+    MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-+    format->encoding = MMAL_ENCODING_I420;
+-+    format->es->video.width = w2;
+-+    format->es->video.height = h2;
+-+    format->es->video.crop.x = 0;
+-+    format->es->video.crop.y = 0;
+-+    format->es->video.crop.width = w;
+-+    format->es->video.crop.height = h;
+-+    mmal_port_format_commit(display->input[0]);
+++    {
+++        MMAL_ES_FORMAT_T* format = display->input[0]->format;
+++        format->encoding = MMAL_ENCODING_I420;
+++        format->es->video.width = geo.stride_y;
+++        format->es->video.height = geo.height_y;
+++        format->es->video.crop.x = 0;
+++        format->es->video.crop.y = 0;
+++        format->es->video.crop.width = w;
+++        format->es->video.crop.height = h;
+++        mmal_port_format_commit(display->input[0]);
+++    }
+ +
+ +    mmal_component_enable(display);
+ +
+-+    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
+++    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+ +
+ +    mmal_port_enable(display->input[0],display_cb_input);
+-+    mmal_port_enable(display->control,display_cb_input);
+++    mmal_port_enable(display->control,display_cb_control);
+ +
+-+    printf("Allocated display %d %d\n",w,h);
+++    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+ +
+ +    return display;
+ +}
+ +
+-+static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
+++static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+ +{
+-+    int w = fr->width;
+-+    int h = fr->height;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+++    MMAL_BUFFER_HEADER_T* buf;
+++
+ +    if (!display || !rpi_pool)
+ +        return;
+-+    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
+++
+++    if (rpi_display_count >= 3) {
+++        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+++        return;
+++    }
+++
+++    buf = mmal_queue_get(rpi_pool->queue);
+ +    if (!buf) {
+-+      // Running too fast so drop the frame
+-+      return;
+++        // Running too fast so drop the frame
+++        printf("Q alloc failure\n");
+++        return;
+ +    }
+ +    assert(buf);
+ +    buf->cmd = 0;
+-+    buf->length = (w2 * h2 * 3)/2;
+ +    buf->offset = 0; // Offset to valid data
+ +    buf->flags = 0;
+ +#ifdef RPI_ZERO_COPY
+-+    buf->data = get_vc_handle(fr->buf[0]);
+-+    buf->alloc_size = (w2*h2*3)/2;
+++{
+++    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++
+++    buf->user_data = fr_buf;
+++    buf->data = av_rpi_zc_vc_handle(fr_buf);
+++    buf->alloc_size =
+++        buf->length = av_rpi_zc_numbytes(fr_buf);
+++
+++    ++rpi_display_count;
+++}
+ +#else
+++{
+++#error YYY
+++    int w = fr->width;
+++    int h = fr->height;
+++    int w2 = (w+31)&~31;
+++    int h2 = (h+15)&~15;
+++
+++    buf->length = (w2 * h2 * 3)/2;
+++    buf->user_data = NULL;
+++
+ +    //mmal_buffer_header_mem_lock(buf);
+ +    memcpy(buf->data, fr->data[0], w2 * h);
+ +    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+ +    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+ +    //mmal_buffer_header_mem_unlock(buf);
+++}
+ +#endif
+ +
+-+    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
+++    while (rpi_display_count >= 3) {
+++        usleep(5000);
+++    }
+++
+++    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+++    {
+++        printf("** send failed: depth=%d\n", rpi_display_count);
+++        display_cb_input(NULL, buf);
+++    }
+ +}
+ +
+ +static void display_exit(MMAL_COMPONENT_T* display)
+@@ -179,4965 +222,6886 @@ index 9ffd833..50c6e86 100644
+  /* sub2video hack:
+     Convert subtitles to video with alpha to insert them in filter graphs.
+     This is a temporary solution until libavfilter gets real subtitles support.
+-@@ -581,6 +728,10 @@ static void ffmpeg_cleanup(int ret)
++@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
++         avformat_close_input(&input_files[i]->ctx);
++         av_freep(&input_files[i]);
+      }
+-     term_exit();
+-     ffmpeg_exited = 1;
+ +
+ +#ifdef RPI_DISPLAY
+ +    display_exit(rpi_display);
+ +#endif
+++
++     for (i = 0; i < nb_input_streams; i++) {
++         InputStream *ist = input_streams[i];
++ 
++@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
++         av_freep(&ist->filters);
++         av_freep(&ist->hwaccel_device);
++ 
+++#ifdef RPI_ZERO_COPY
+++        av_rpi_zc_uninit(ist->dec_ctx);
+++#endif
++         avcodec_free_context(&ist->dec_ctx);
++ 
++         av_freep(&input_streams[i]);
++@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
++     }
++     term_exit();
++     ffmpeg_exited = 1;
+++
+  }
+  
+  void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -940,6 +1091,14 @@ static void do_video_out(AVFormatContext *s,
+-     int frame_size = 0;
+-     InputStream *ist = NULL;
+-     AVFilterContext *filter = ost->filter->filter;
++@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
++     if (ost->source_index >= 0)
++         ist = input_streams[ost->source_index];
++ 
+ +#ifdef RPI_DISPLAY
+-+    if (next_picture)
+++    if (next_picture && ist != NULL)
+ +    {
+-+	if (!rpi_display)
+++        if (!rpi_display)
+ +           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+-+        display_frame(rpi_display,next_picture);
+++        display_frame(ist->dec_ctx, rpi_display, next_picture);
+ +    }
+ +#endif
+++
++     if (filter->inputs[0]->frame_rate.num > 0 &&
++         filter->inputs[0]->frame_rate.den > 0)
++         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
++@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
++         ist->dec_ctx->opaque                = ist;
++         ist->dec_ctx->get_format            = get_format;
++         ist->dec_ctx->get_buffer2           = get_buffer;
+++
+++#ifdef RPI_ZERO_COPY
+++        // Overrides the above get_buffer2
+++        av_rpi_zc_init(ist->dec_ctx);
+++#endif
+++
++         ist->dec_ctx->thread_safe_callbacks = 1;
+  
+-     if (ost->source_index >= 0)
+-         ist = input_streams[ost->source_index];
+--- 
+-2.7.4
+-
+-
+-From b90a5aff7bf9112ebd2a07949c8d79a49fcafe48 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 29 Apr 2015 16:49:43 +0100
+-Subject: [PATCH 02/68] Split transform and intra prediction into commands
+-
+----
+- libavcodec/hevc.c       | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
+- libavcodec/hevc.h       |  58 +++++++++++++++++++++++
+- libavcodec/hevc_cabac.c |  15 ++++++
+- 3 files changed, 191 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b478065..aa45dd6 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -931,6 +931,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+-     return 0;
+- }
++         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
++diff --git a/libavcodec/Makefile b/libavcodec/Makefile
++index fd0d1f0..40d22d2 100644
++--- a/libavcodec/Makefile
+++++ b/libavcodec/Makefile
++@@ -5,6 +5,11 @@ NAME = avcodec
++ HEADERS = avcodec.h                                                     \
++           avdct.h                                                       \
++           avfft.h                                                       \
+++          rpi_qpu.h                                                     \
+++          rpi_shader.h                                                  \
+++          rpi_mailbox.h                                                 \
+++          rpi_hevc_transform.h                                          \
+++          rpi_zc.h                                                      \
++           d3d11va.h                                                     \
++           dirac.h                                                       \
++           dv_profile.h                                                  \
++@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
++        resample.o                                                       \
++        resample2.o                                                      \
++        utils.o                                                          \
+++       rpi_qpu.o                                                        \
+++       rpi_shader.o                                                     \
+++       rpi_mailbox.o                                                    \
+++       rpi_zc.o                                                         \
++        vorbis_parser.o                                                  \
++        xiph.o                                                           \
+  
+-+#ifdef RPI
+-+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+-+{
+-+    if (s->enable_rpi) {
+-+        HEVCLocalContext *lc = s->HEVClc;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        cmd->type = RPI_PRED_INTRA;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->c_idx = c_idx;
+-+        cmd->x = x0;
+-+        cmd->y = y0;
+-+        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+-+    } else {
+-+        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+-+    }
+-+}
++@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
++ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
++ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
++ endif
+++
+++$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+++	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+++	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
++diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
++index 54efaad..02a89c3 100644
++--- a/libavcodec/allcodecs.c
+++++ b/libavcodec/allcodecs.c
++@@ -667,6 +667,7 @@ void avcodec_register_all(void)
++     REGISTER_PARSER(H261,               h261);
++     REGISTER_PARSER(H263,               h263);
++     REGISTER_PARSER(H264,               h264);
+++    REGISTER_PARSER(H264_MVC,           h264_mvc);
++     REGISTER_PARSER(HEVC,               hevc);
++     REGISTER_PARSER(MJPEG,              mjpeg);
++     REGISTER_PARSER(MLP,                mlp);
++diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
++index a4ceca7..1354c14 100644
++--- a/libavcodec/arm/Makefile
+++++ b/libavcodec/arm/Makefile
++@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
++ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
++ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
++                                           arm/hevcdsp_deblock_neon.o    \
+++                                          arm/hevcdsp_epel_neon.o       \
++                                           arm/hevcdsp_idct_neon.o       \
++-                                          arm/hevcdsp_qpel_neon.o
+++                                          arm/hevcdsp_qpel_neon.o       \
+++                                          arm/hevcdsp_sao_neon.o
++ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
++ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
++                                           arm/rv40dsp_neon.o
++diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
++index fdbf86b..0a3980a 100644
++--- a/libavcodec/arm/cabac.h
+++++ b/libavcodec/arm/cabac.h
++@@ -26,13 +26,34 @@
++ #include "libavutil/internal.h"
++ #include "libavcodec/cabac.h"
++ 
+++
+++#if UNCHECKED_BITSTREAM_READER
+++#define LOAD_16BITS_BEHI\
+++        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+++#elif CONFIG_THUMB
+++#define LOAD_16BITS_BEHI\
+++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+++        "it         cs                                          \n\t"\
+++        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+++#else
+++#define LOAD_16BITS_BEHI\
+++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+++        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+ +#endif
+ +
+- static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                               int xBase, int yBase, int cb_xBase, int cb_yBase,
+-                               int log2_cb_size, int log2_trafo_size,
+-@@ -943,8 +962,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-     if (lc->cu.pred_mode == MODE_INTRA) {
+-         int trafo_size = 1 << log2_trafo_size;
+-         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
+++
++ #define get_cabac_inline get_cabac_inline_arm
++ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
++                                                  uint8_t *const state)
++ {
++     int bit;
+++#if 0
++     void *reg_b, *reg_c, *tmp;
+ -
+-+#ifdef RPI
+-+        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
++     __asm__ volatile(
++         "ldrb       %[bit]        , [%[state]]                  \n\t"
++         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
++@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
++           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
++         : "memory", "cc"
++         );
+ +#else
+-         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+++   // *** Not thumb compatible yet
+++   unsigned int reg_b, tmp;
+++    __asm__ (
+++        "ldrb       %[bit]        , [%[state]]                  \n\t"
+++        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+++        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
+++// %bit = *state
+++// %range = range
+++// %tmp = RangeLPS
+++        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "ittt       ge                                          \n\t"
+++        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "mvnge      %[bit]        , %[bit]                      \n\t"
+++        "movge      %[range]      , %[tmp]                      \n\t"
+++
+++        "clz        %[tmp]        , %[range]                    \n\t"
+++        "sub        %[tmp]        , #23                         \n\t"
+++
+++        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++        "strb       %[r_b]        , [%[state]]                  \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++
+++        "bne        2f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+++        "movw       %[r_b]        , #0xFFFF                     \n\t"
+++        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++
+++        "rbit       %[r_b]        , %[low]                      \n\t"
+++        "clz        %[r_b]        , %[r_b]                      \n\t"
+++        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+++#if CONFIG_THUMB
+++        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+++#else
+++        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+++#endif
+++        "2:                                                     \n\t"
+++        :    [bit]"=&r"(bit),
+++             [low]"+&r"(c->low),
+++           [range]"+&r"(c->range),
+++             [r_b]"=&r"(reg_b),
+++             [ptr]"+&r"(c->bytestream),
+++             [tmp]"=&r"(tmp)
+++          :  [state]"r"(state),
+++            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++              [byte]"M"(offsetof(CABACContext, bytestream)),
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++        : "memory", "cc"
+++        );
+ +#endif
+-     }
+  
+-     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+-@@ -1030,7 +1052,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++     return bit & 1;
++ }
+++
+++#define get_cabac_bypass get_cabac_bypass_arm
+++static inline int get_cabac_bypass_arm(CABACContext * const c)
+++{
+++    int rv = 0;
+++    unsigned int tmp;
+++    __asm (
+++        "lsl        %[low]        , #1                          \n\t"
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "adc        %[rv]         , %[rv]       , #0            \n\t"
+++        "it         cs                                          \n\t"
+++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++        "bne        1f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+++        "movw       %[tmp]        , #0xFFFF                     \n\t"
+++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "1:                                                     \n\t"
+++        : // Outputs
+++              [rv]"+&r"(rv),
+++             [low]"+&r"(c->low),
+++             [tmp]"=&r"(tmp),
+++             [ptr]"+&r"(c->bytestream)
+++        : // Inputs
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++             [range]"r"(c->range)
+++        : "cc"
+++    );
+++    return rv;
+++}
+++
+++
+++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+++{
+++    unsigned int tmp;
+++    __asm (
+++        "lsl        %[low]        , #1                          \n\t"
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "ite        cc                                          \n\t"
+++        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
+++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++        "bne        1f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+++        "movw       %[tmp]        , #0xFFFF                     \n\t"
+++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "1:                                                     \n\t"
+++        : // Outputs
+++              [rv]"+&r"(rv),
+++             [low]"+&r"(c->low),
+++             [tmp]"=&r"(tmp),
+++             [ptr]"+&r"(c->bytestream)
+++        : // Inputs
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++             [range]"r"(c->range)
+++        : "cc"
+++    );
+++    return rv;
+++}
+++
++ #endif /* HAVE_ARMV6T2_INLINE */
++ 
++ #endif /* AVCODEC_ARM_CABAC_H */
++diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
++new file mode 100644
++index 0000000..31d3c59
++--- /dev/null
+++++ b/libavcodec/arm/hevc_cabac.h
++@@ -0,0 +1,491 @@
+++/*
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+++
+++#ifndef AVCODEC_ARM_HEVC_CABAC_H
+++#define AVCODEC_ARM_HEVC_CABAC_H
+++
+++#include "config.h"
+++#if HAVE_ARMV6T2_INLINE
+++
+++#define hevc_mem_bits32 hevc_mem_bits32_arm
+++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+++{
+++    unsigned int n;
+++    __asm__ (
+++        "rev        %[n], %[x]                     \n\t"
+++        : [n]"=r"(n)
+++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+++        :
+++        );
+++    return n << (bits & 7);
+++}
+++
+++
+++// ---------------------------------------------------------------------------
+++//
+++// Helper fns - little bits of code where ARM has an instruction that the
+++// compiler doesn't know about / use
+++
+++#define trans_scale_sat trans_scale_sat_arm
+++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+++{
+++    int rv;
+++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+++
+++    __asm__ (
+++    "ssat %[rv], #16, %[t], ASR #1 \n\t"
+++    : [rv]"=r"(rv)
+++    : [t]"r"(t)
+++    :
+++    );
+++    return rv;
+++}
+++
+++#define update_rice update_rice_arm
+++static inline void update_rice_arm(uint8_t * const stat_coeff,
+++    const unsigned int last_coeff_abs_level_remaining,
+++    const unsigned int c_rice_param)
+++{
+++    int t;
+++    __asm__ (
+++    "lsl   %[t], %[coeff], #1               \n\t"
+++    "lsrs  %[t], %[t], %[shift]             \n\t"
+++    "it    eq                               \n\t"
+++    "subeq %[stat], %[stat], #1             \n\t"
+++    "cmp   %[t], #6                         \n\t"
+++    "adc   %[stat], %[stat], #0             \n\t"
+++    "usat  %[stat], #8, %[stat]             \n\t"
+++    : [stat]"+&r"(*stat_coeff),
+++         [t]"=&r"(t)
+++    :  [coeff]"r"(last_coeff_abs_level_remaining),
+++       [shift]"r"(c_rice_param)
+++    : "cc"
+++    );
+++}
+++
+++// ---------------------------------------------------------------------------
+++//
+++// CABAC get loops
+++//
+++// Where the loop is simple enough we can normally do 10-30% better than the
+++// compiler
+++
+++// Get the residual greater than 1 bits
+++
+++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+++    uint8_t * const state0)
+++{
+++    unsigned int i, reg_b, st, tmp, bit, rv;
+++     __asm__ (
+++         "mov        %[i]          , #0                          \n\t"
+++         "mov        %[rv]         , #0                          \n\t"
+++         "1:                                                     \n\t"
+++         "add        %[i]          , %[i]        , #1            \n\t"
+++         "cmp        %[rv]         , #0                          \n\t"
+++         "ite        eq                                          \n\t"
+++         "usateq     %[st]         , #2          , %[i]          \n\t"
+++         "movne      %[st]         , #0                          \n\t"
+++
+++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
+++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "cmp        %[low]        , %[range], lsl #17           \n\t"
+++         "ittt       ge                                          \n\t"
+++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++         "mvnge      %[bit]        , %[bit]                      \n\t"
+++         "movge      %[range]      , %[tmp]                      \n\t"
+++
+++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++         "and        %[bit]        , %[bit]      , #1            \n\t"
+++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
+++
+++         "clz        %[tmp]        , %[range]                    \n\t"
+++         "sub        %[tmp]        , #23                         \n\t"
+++
+++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+++// There is a small speed gain from combining both conditions, using a single
+++// branch and then working out what that meant later
+++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++         "it         ne                                          \n\t"
+++         "cmpne      %[n]          , %[i]                        \n\t"
+++         "bne        1b                                          \n\t"
+++
+++// If reload is not required then we must have run out of flags to decode
+++         "tst        %[tmp]        , %[tmp]                      \n\t"
+++         "bne        2f                                          \n\t"
+++
+++// Do reload
+++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+++         "movw       %[r_b]        , #0xFFFF                     \n\t"
+++         "rev        %[tmp]        , %[tmp]                      \n\t"
+++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+++
+++         "rbit       %[r_b]        , %[low]                      \n\t"
+++         "clz        %[r_b]        , %[r_b]                      \n\t"
+++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+++
+++#if CONFIG_THUMB
+++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cb[i])
+-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1059,7 +1085,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+++
+++         "cmp        %[n]          , %[i]                        \n\t"
+++         "bne        1b                                          \n\t"
+++         "2:                                                     \n\t"
+++         :    [bit]"=&r"(bit),
+++              [low]"+&r"(c->low),
+++            [range]"+&r"(c->range),
+++              [r_b]"=&r"(reg_b),
+++             [bptr]"+&r"(c->bytestream),
+++                [i]"=&r"(i),
+++              [tmp]"=&r"(tmp),
+++               [st]"=&r"(st),
+++               [rv]"=&r"(rv)
+++          :  [state0]"r"(state0),
+++                  [n]"r"(n),
+++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++               [byte]"M"(offsetof(CABACContext, bytestream)),
+++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++         : "memory", "cc"
+++    );
+++    return rv;
+++}
+++
+++
+++// n must be > 0 on entry
+++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * p)
+++{
+++    unsigned int reg_b, tmp, st, bit;
+++     __asm__ (
+++         "1:                                                     \n\t"
+++// Get bin from map
+++         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
+++
+++// Load state & ranges
+++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
+++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
+++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "cmp        %[low]        , %[range], lsl #17           \n\t"
+++         "ittt       ge                                          \n\t"
+++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++         "mvnge      %[bit]        , %[bit]                      \n\t"
+++         "movge      %[range]      , %[tmp]                      \n\t"
+++
+++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++         "tst        %[bit]        , #1                          \n\t"
+++// GCC asm seems to need strbne written differently for thumb and arm
+++#if CONFIG_THUMB
+++         "it         ne                                          \n\t"
+++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cr[i])
+-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1088,7 +1118,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+-                                                     trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+++
+++// Renorm
+++         "clz        %[tmp]        , %[range]                    \n\t"
+++         "sub        %[tmp]        , #23                         \n\t"
+++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+++// There is a small speed gain from combining both conditions, using a single
+++// branch and then working out what that meant later
+++         "subs       %[n]          , %[n]        , #1            \n\t"
+++#if CONFIG_THUMB
+++         "itt        ne                                          \n\t"
+++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        1b                                          \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        1b                                          \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cb[i])
+-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1098,7 +1132,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+-+#else
+-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+-+#endif
+-                 }
+-                 if (cbf_cr[i])
+-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1110,26 +1148,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+-             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+-             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+-+#else
+-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+-+#endif
+-             if (s->ps.sps->chroma_format_idc == 2) {
+-                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+-+#else
+-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+-+#endif
+-             }
+-         } else if (blk_idx == 3) {
+-             int trafo_size_h = 1 << (log2_trafo_size + 1);
+-             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+-             ff_hevc_set_neighbour_available(s, xBase, yBase,
+-                                             trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+-+#else
+-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+-+#endif
+-             if (s->ps.sps->chroma_format_idc == 2) {
+-                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+-+#else
+-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+-+#endif
+-             }
+-         }
+-     }
+-@@ -2304,6 +2362,31 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+-     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+- }
+- 
+-+#ifdef RPI
+-+static void rpi_execute_pred_cmds(HEVCContext *s)
+-+{
+-+  int i;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds;
+-+  HEVCLocalContext *lc = s->HEVClc;
+ +
+-+  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
+-+      if (cmd->type == RPI_PRED_INTRA) {
+-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
+-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
+-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-+      } else {
+-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+      }
+-+  }
+-+  s->num_pred_cmds = 0;
+-+  s->num_coeffs = 0;
+-+}
+-+#endif
+++// If we have bits left then n must be 0 so give up now
+++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        2f                                          \n\t"
+ +
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -2313,6 +2396,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int y_ctb       = 0;
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+-+#ifdef RPI
+-+    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+-+#endif
+++// Do reload
+++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+++         "movw       %[r_b]        , #0xFFFF                     \n\t"
+++         "rev        %[tmp]        , %[tmp]                      \n\t"
+++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+ +
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-         return AVERROR_INVALIDDATA;
+-@@ -2342,6 +2429,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+-+#ifdef RPI
+-+        rpi_execute_pred_cmds(s);
+-+#endif
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-             return more_data;
+-@@ -2387,6 +2477,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+-     s = s1->sList[self_id];
+-     lc = s->HEVClc;
+- 
+-+#ifdef RPI
+-+    s->enable_rpi = 0;
+-+#endif
+++         "rbit       %[r_b]        , %[low]                      \n\t"
+++         "clz        %[r_b]        , %[r_b]                      \n\t"
+++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+ +
+-     if(ctb_row) {
+-         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+- 
+-@@ -3075,6 +3169,13 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- 
+-     av_freep(&s->cabac_state);
+- 
+-+#ifdef RPI
+-+    av_freep(&s->unif_mv_cmds);
+-+    av_freep(&s->unif_xfm_cmds);
+-+    av_freep(&s->univ_pred_cmds);
+-+    av_freep(&s->coeffs_buf);
+++#if CONFIG_THUMB
+++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+++#else
+++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+ +#endif
+ +
+-     for (i = 0; i < 3; i++) {
+-         av_freep(&s->sao_pixel_buffer_h[i]);
+-         av_freep(&s->sao_pixel_buffer_v[i]);
+-@@ -3129,6 +3230,22 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->HEVClcList[0] = s->HEVClc;
+-     s->sList[0] = s;
+- 
+-+#ifdef RPI
+-+    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+    if (!s->unif_mv_cmds)
+-+        goto fail;
+-+    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
+-+    if (!s->unif_xfm_cmds)
+-+        goto fail;
+-+    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-+    if (!s->univ_pred_cmds)
+-+        goto fail;
+-+    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
+-+    if (!s->coeffs_buf)
+-+        goto fail;
+-+    s->enable_rpi = 0;
+-+#endif
+++// Check to see if we still have more to do
+++         "cmp        %[n]          , #0                          \n\t"
+++         "bne        1b                                          \n\t"
+++         "2:                                                     \n\t"
+++         :    [bit]"=&r"(bit),
+++              [low]"+&r"(c->low),
+++            [range]"+&r"(c->range),
+++              [r_b]"=&r"(reg_b),
+++             [bptr]"+&r"(c->bytestream),
+++              [idx]"+&r"(p),
+++                [n]"+&r"(n),
+++              [tmp]"=&r"(tmp),
+++               [st]"=&r"(st)
+++          :  [state0]"r"(state0),
+++            [ctx_map]"r"(ctx_map),
+++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++               [byte]"M"(offsetof(CABACContext, bytestream)),
+++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++         : "memory", "cc"
+++    );
+++
+++    return p;
+++}
+ +
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-     if (!s->cabac_state)
+-         goto fail;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index be91010..7a1c35f 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -23,6 +23,9 @@
+- #ifndef AVCODEC_HEVC_H
+- #define AVCODEC_HEVC_H
+- 
+-+// define RPI to split the CABAC/prediction/transform into separate stages
+-+#include "config.h"
+++// ---------------------------------------------------------------------------
+++//
+++// CABAC_BY22 functions
+++//
+++// By and large these are (at best) no faster than their C equivalents - the
+++// only one worth having is _peek where we do a slightly better job than the
+++// compiler
+++//
+++// The others have been stashed here for reference in case larger scale asm
+++// is attempted in which case they might be a useful base
+ +
+- #include "libavutil/buffer.h"
+- #include "libavutil/md5.h"
+- 
+-@@ -790,6 +793,49 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+-+#ifdef RPI
+ +
+-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-+#define RPI_MAX_WIDTH 2048
++#define get_cabac_by22_peek get_cabac_by22_peek_arm  // route the generic name to the ARM inline-asm version below
++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)  // reads c->low and c->range only; c is not modified
++{
++    uint32_t rv, tmp;
++    __asm__ (
++        "bic      %[rv]  , %[low], #1            \n\t"  // rv = c->low with bit 0 cleared
++        "cmp      %[inv] , #0                    \n\t"  // is the multiplier (c->range) zero?
++        "it       ne                             \n\t"  // IT block so the conditional multiply is legal in Thumb-2
++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"  // if inv != 0: rv = high 32 bits of rv * inv, tmp = low 32 bits
++        :  // Outputs
++             [rv]"=&r"(rv),
++             [tmp]"=r"(tmp)  // scratch: low half of the product, never read afterwards
++        :  // Inputs
++             [low]"r"(c->low),
++             [inv]"r"(c->range)  // presumably an inverse-range value while in by22 mode (hence "inv") - confirm against the by22 setup code
++        :  // Clobbers
++                "cc"  // cmp changes the condition flags
++    );
++    return rv << 1;  // when inv == 0 this is just (low & ~1) << 1
++}
+ +
+-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
+-+#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
+-+#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+-+// Each block can have an intra prediction and a transform_add command
+-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+++#if 0
+ +
+-+// Command for inter prediction
+-+typedef struct HEVCMvCmd {
+-+} HEVCMvCmd;
++// ***** Slower than the C  :-(   (kept for reference only: compiled out by the enclosing #if 0)
++#define get_cabac_by22_flush get_cabac_by22_flush_arm
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)  // updates c->by22.bits and c->low
++{
++    uint32_t m, tmp;
++    __asm__ (
++    "add    %[bits], %[bits], %[n]   \n\t"  // bits += n
++    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"  // m = 32-bit word at bytestream + (bits >> 3)
++
++    "rsb    %[tmp], %[n], #32        \n\t"  // tmp = 32 - n
++    "lsr    %[tmp], %[val], %[tmp]   \n\t"  // tmp = top n bits of val
++    "mul    %[tmp], %[range], %[tmp] \n\t"  // tmp *= c->by22.range
++
++    "rev    %[m], %[m]               \n\t"  // byte-reverse the loaded word
++
++    "lsl    %[tmp], %[tmp], #23      \n\t"
++    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"  // low = (low << n) - tmp
++
++    "and    %[tmp], %[bits], #7         \n\t"  // bit offset inside the first loaded byte
++    "lsl    %[m], %[m], %[tmp]          \n\t"  // shift out the bits below that offset
++
++    "orr    %[low], %[low], %[m], lsr #9      \n\t"  // merge the new bits into low
++        :  // Outputs
++             [m]"=&r"(m),
++           [tmp]"=&r"(tmp),
++          [bits]"+&r"(c->by22.bits),
++           [low]"+&r"(c->low)
++        :  // Inputs
++               [n]"r"(n),
++             [val]"r"(val),
++             [inv]"r"(c->range),  // NOTE(review): %[inv] is not referenced by the template above
++           [range]"r"(c->by22.range),
++             [ptr]"r"(c->bytestream)
++        :  // Clobbers  (NOTE(review): the ldr reads memory with no "memory" clobber or "m" operand - confirm before re-enabling)
++    );
++}
+ +
+-+// Command for transform to process a block of coefficients
+-+typedef struct HEVCXfmCmd {
+-+} HEVCXfmCmd;
+ +
+-+// Command for intra prediction and transform_add of predictions to coefficients
+-+#define RPI_PRED_TRANSFORM_ADD 0
+-+#define RPI_PRED_INTRA 1
+-+typedef struct HEVCPredCmd {
+-+    uint8_t size;
+-+    uint8_t type;
+-+    uint8_t na;
+-+    uint8_t c_idx;
+-+    union {
+-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t x;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t y;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t stride;         // RPI_PRED_INTRA
+-+    };
+-+} HEVCPredCmd;
+++// Works but slower than C
+++#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+++static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+++{
+++    uint32_t n, val, tmp, level;
+ +
+-+#endif
+++//    PROFILE_START();
+ +
+- typedef struct HEVCContext {
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+-@@ -805,6 +851,18 @@ typedef struct HEVCContext {
+-     int                 width;
+-     int                 height;
+- 
+-+#ifdef RPI
+-+    int enable_rpi;
+-+    HEVCMvCmd *unif_mv_cmds;
+-+    HEVCXfmCmd *unif_xfm_cmds;
+-+    HEVCPredCmd *univ_pred_cmds;
+-+    int16_t *coeffs_buf;
+-+    int num_mv_cmds;
+-+    int num_xfm_cmds;
+-+    int num_pred_cmds;
+-+    int num_coeffs;
+-+#endif
+++    __asm__ (
+++            // Peek
+++            "bic    %[val],  %[low],   #1  \n\t"
+++            "cmp    %[inv], #0          \n\t"
+++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+++            "lsl    %[val], %[val], #1  \n\t"
+ +
+-     uint8_t *cabac_state;
+- 
+-     /** 1 if the independent slice segment header was successfully parsed */
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 05b2821..4e97f06 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1510,6 +1510,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-         }
+-     }
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int16_t *c = s->coeffs_buf + s->num_coeffs;
+-+        int n = trafo_size * trafo_size;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
+-+        s->num_coeffs += n;
+-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->buf = c;
+-+        cmd->dst = dst;
+-+        cmd->stride = stride;
+-+        return;
+-+    }
+-+#endif
+-     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+- }
+- 
+--- 
+-2.7.4
+-
+-
+-From f8293de11dc040d9fa2a558762a357c0c353d2c9 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 30 Apr 2015 15:23:22 +0100
+-Subject: [PATCH 03/68] Added simple VPU test code
+-
+----
+- libavcodec/Makefile             |    7 +
+- libavcodec/hevc.c               |   33 +-
+- libavcodec/rpi_hevc_transform.h |  212 ++++++
+- libavcodec/rpi_hevc_transform.s |  147 ++++
+- libavcodec/rpi_mailbox.c        |  293 ++++++++
+- libavcodec/rpi_mailbox.h        |   20 +
+- libavcodec/rpi_qpu.c            |  652 ++++++++++++++++++
+- libavcodec/rpi_qpu.h            |   45 ++
+- libavcodec/rpi_shader.c         |  818 ++++++++++++++++++++++
+- libavcodec/rpi_shader.h         |   20 +
+- libavcodec/rpi_shader.qasm      | 1413 +++++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_user_vcsm.h      |  425 ++++++++++++
+- 12 files changed, 4084 insertions(+), 1 deletion(-)
+- create mode 100644 libavcodec/rpi_hevc_transform.h
+- create mode 100644 libavcodec/rpi_hevc_transform.s
+- create mode 100644 libavcodec/rpi_mailbox.c
+- create mode 100644 libavcodec/rpi_mailbox.h
+- create mode 100644 libavcodec/rpi_qpu.c
+- create mode 100644 libavcodec/rpi_qpu.h
+- create mode 100644 libavcodec/rpi_shader.c
+- create mode 100644 libavcodec/rpi_shader.h
+- create mode 100644 libavcodec/rpi_shader.qasm
+- create mode 100644 libavcodec/rpi_user_vcsm.h
+-
+-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index fd0d1f0..03065cd 100644
+---- a/libavcodec/Makefile
+-+++ b/libavcodec/Makefile
+-@@ -5,6 +5,10 @@ NAME = avcodec
+- HEADERS = avcodec.h                                                     \
+-           avdct.h                                                       \
+-           avfft.h                                                       \
+-+          rpi_qpu.h                                                     \
+-+          rpi_shader.h                                                  \
+-+          rpi_mailbox.h                                                 \
+-+          rpi_hevc_transform.h                                          \
+-           d3d11va.h                                                     \
+-           dirac.h                                                       \
+-           dv_profile.h                                                  \
+-@@ -43,6 +47,9 @@ OBJS = allcodecs.o                                                      \
+-        resample.o                                                       \
+-        resample2.o                                                      \
+-        utils.o                                                          \
+-+       rpi_qpu.o                                                        \
+-+       rpi_shader.o                                                     \
+-+       rpi_mailbox.o                                                    \
+-        vorbis_parser.o                                                  \
+-        xiph.o                                                           \
+- 
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index aa45dd6..ab55df1 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -41,6 +41,10 @@
+- #include "hevc.h"
+- #include "profiles.h"
+- 
+-+#ifdef RPI
+-+#include "rpi_qpu.h"
+++            // Count bits (n = prefix)
+++            "mvn    %[n], %[val] \n\t"
+++            "clz    %[n], %[n]   \n\t"
+++
+++            "lsl    %[level], %[val], %[n] \n\t"
+++            "subs   %[tmp], %[n], #3 \n\t"
+++            "blo    2f \n\t"
+++
+++            // prefix >= 3
+++            // < tmp = prefix - 3
+++            // > tmp = prefix + rice - 3
+++            "add    %[tmp], %[tmp], %[rice] \n\t"
+++            // > n = prefix * 2 + rice - 3
+++            "add    %[n], %[tmp], %[n] \n\t"
+++            "cmp    %[n], #21 \n\t"
+++            "bhi    3f \n\t"
+++
+++            "orr    %[level], %[level], #0x80000000 \n\t"
+++            "rsb    %[tmp], %[tmp], #31 \n\t"
+++            "lsr    %[level], %[level], %[tmp] \n\t"
+++
+++            "mov    %[tmp], #2 \n\t"
+++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+++            "b      1f \n\t"
+++
+++            // > 22 bits used in total - need reload
+++            "3:  \n\t"
+++
+++            // Stash prefix + rice - 3 in level (only spare reg)
+++            "mov    %[level], %[tmp] \n\t"
+++            // Restore n to flush value (prefix)
+++            "sub    %[n], %[n], %[tmp] \n\t"
+++
+++            // Flush + reload
+++
+++//          "rsb    %[tmp], %[n], #32        \n\t"
+++//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
+++//          "mul    %[tmp], %[range], %[tmp] \n\t"
+++
+++            // As it happens we know that all the bits we are flushing are 1
+++            // so we can cheat slightly
+++            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
+++            "lsl    %[tmp], %[tmp], #23      \n\t"
+++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+++
+++            "add    %[bits], %[bits], %[n]   \n\t"
+++            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
+++            "rev    %[n], %[n]               \n\t"
+++            "and    %[tmp], %[bits], #7         \n\t"
+++            "lsl    %[n], %[n], %[tmp]          \n\t"
+++
+++            "orr    %[low], %[low], %[n], lsr #9      \n\t"
+++
+++            // (reload)
+++
+++            "bic    %[val],  %[low],   #1  \n\t"
+++            "cmp    %[inv], #0          \n\t"
+++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+++            "lsl    %[val], %[val], #1  \n\t"
+++
+++            // Build value
+++
+++            "mov    %[n], %[level] \n\t"
+++
+++            "orr     %[tmp], %[val], #0x80000000 \n\t"
+++            "rsb     %[level], %[level], #31 \n\t"
+++            "lsr     %[level], %[tmp], %[level] \n\t"
+++
+++            "mov    %[tmp], #2 \n\t"
+++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+++            "b      1f \n\t"
+++
+++            // prefix < 3
+++            "2:  \n\t"
+++            "rsb    %[tmp], %[rice], #31 \n\t"
+++            "lsr    %[level], %[level], %[tmp] \n\t"
+++            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
+++            "add    %[n], %[n], %[rice] \n\t"
+++
+++            "1:  \n\t"
+++            // Flush
+++            "add    %[n], %[n], #1 \n\t"
+++
+++            "rsb    %[tmp], %[n], #32        \n\t"
+++            "lsr    %[tmp], %[val], %[tmp]   \n\t"
+++
+++            "add    %[bits], %[bits], %[n]   \n\t"
+++            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
+++
+++            "mul    %[tmp], %[range], %[tmp] \n\t"
+++            "lsl    %[tmp], %[tmp], #23      \n\t"
+++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+++
+++            "rev    %[val], %[val]               \n\t"
+++            "and    %[tmp], %[bits], #7         \n\t"
+++            "lsl    %[val], %[val], %[tmp]          \n\t"
+++
+++            "orr    %[low], %[low], %[val], lsr #9      \n\t"
+++        :  // Outputs
+++         [level]"=&r"(level),
+++             [n]"=&r"(n),
+++           [val]"=&r"(val),
+++           [tmp]"=&r"(tmp),
+++          [bits]"+&r"(c->by22.bits),
+++           [low]"+&r"(c->low)
+++        :  // Inputs
+++            [rice]"r"(c_rice_param),
+++             [inv]"r"(c->range),
+++           [range]"r"(c->by22.range),
+++             [ptr]"r"(c->bytestream)
+++        :  // Clobbers
+++                "cc"
+++    );
+++
+++//    PROFILE_ACC(residual_abs);
+++
+++    return level;
+++}
+ +#endif
+ +
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- /**
+-@@ -2430,7 +2434,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        rpi_execute_pred_cmds(s);
+-+        if (x_ctb + ctb_size >= s->ps.sps->width) {
+-+            rpi_execute_pred_cmds(s);
+-+        }
+- #endif
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-@@ -3244,6 +3250,31 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->coeffs_buf)
+-         goto fail;
+-     s->enable_rpi = 0;
+-+
+-+    // A little test program
+-+    {
+-+      GPU_MEM_PTR_T p;
+-+      int err = gpu_malloc_cached(16, &p);
+-+      short *q = (short *)p.arm;
+-+      int i;
+-+      int r;
+-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
+-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
+-+      printf("Preparing data %p\n",q);
+-+      for(i=0;i<16;i++)
+-+        q[i] = i;
+-+      printf("Flush cache\n");
+-+      gpu_cache_flush(&p);
+-+      printf("Executing code\n");
+-+      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
+-+      printf("Return value %d (",r);
+-+      for(i=0;i<16;i++)
+-+        printf("%d ",q[i]);
+-+      printf(")\n");
+-+      gpu_free(&p);
+-+      goto fail; // Early out
+-+    }
+++#endif /* HAVE_ARMV6T2_INLINE */
+ +
+- #endif
+- 
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
++diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
++index 166bddb..a088cc3 100644
++--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
++@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
++         vst1.8   {d4}, [r0]
++         bx       lr
++ endfunc
+++
+++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+++ */
++function ff_hevc_deblocking_boundary_strengths_neon, export=1
++        add         ip, sp, #4*4    @ ip -> stacked curr, neigh, bs (just past the four stacked rpl pointers)
++        push        {a2-a4,v1-v8,lr}    @ 12 words; a2 and a3 (dup, in_inc) are reloaded from here every iteration
++        ldmia       ip, {v5-v7}     @ v5 = curr, v6 = neigh, v7 = bs
++1:      ldmdb       ip, {v1-v4}     @ v1-v4 = curr_rpl0, curr_rpl1, neigh_rpl0, neigh_rpl1
++        ldrsb       a3, [v5, #8]    @ curr->ref_idx
++        ldrsb       v8, [v5, #9]    @ next byte, used to index curr_rpl1 (presumably ref_idx[1] - confirm against MvField)
++        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
++        ldrsb       lr, [v6, #9]    @ next byte, used to index neigh_rpl1
++        ldr         v1, [v1, a3, lsl #2]    @ v1 = curr_rpl0 entry
++        ldrb        a3, [v5, #10]   @ curr->pred_flag
++        ldr         v2, [v2, v8, lsl #2]    @ v2 = curr_rpl1 entry
++        ldrb        v8, [v6, #10]   @ neigh->pred_flag
++        ldr         v3, [v3, ip, lsl #2]    @ v3 = neigh_rpl0 entry
++        ldr         v4, [v4, lr, lsl #2]    @ v4 = neigh_rpl1 entry
++        teq         a3, #3
++        beq         20f             @ curr pred_flag == 3: two-reference path
++        teq         v8, #3
++        beq         90f             @ only neigh has pred_flag 3: strength 1
++
++        tst         a3, #1          @ curr: flag bit 0 picks mv[0] with the rpl0 entry, else mv[1] with the rpl1 entry
++        itee        ne
++        ldrne       a3, [v5, #0]    @ curr->mv[0]
++        ldreq       a3, [v5, #4]    @ curr->mv[1]
++        moveq       v1, v2
++        tst         v8, #1          @ same selection for neigh
++        itee        ne
++        ldrne       v8, [v6, #0]    @ neigh->mv[0]
++        ldreq       v8, [v6, #4]    @ neigh->mv[1]
++        moveq       v3, v4
++        teq         v1, v3
++        bne         10f             @ selected rpl entries differ: strength 1
++        ldr         lr, =0xFFFCFFFC @ per-halfword mask: nonzero after AND means the value is 4 or more
++        ssub16      ip, v8, a3      @ neigh - curr in each 16-bit lane
++        ssub16      a3, a3, v8      @ curr - neigh; also sets the GE bits consumed by sel
++        sel         a3, a3, ip      @ per lane: whichever difference is non-negative
++        ands        a3, a3, lr      @ Z clear if either mv component differs by 4 or more
++        @ drop through
++10:     it          ne
++        movne       a3, #1          @ collapse any nonzero result to strength 1
++11:     subs        a2, a2, #1      @ a2 counts down from dup; HS while stores remain
++12:
++A       strbhs      a3, [v7], a4    @ store strength, bs += out_inc
++T       itt         hs
++T       strbhs      a3, [v7]
++T       addhs       v7, v7, a4
++        subs        a2, a2, #1
++        bhs         12b
++
++        ldm         sp, {a2, a3}    @ reload dup and in_inc saved by the push
++        add         ip, sp, #16*4   @ ip -> stacked curr again (12 pushed words + 4 rpl pointers)
++        subs        a1, a1, #1      @ one more pu done
++        add         v5, v5, a3      @ curr += in_inc (bytes)
++        add         v6, v6, a3      @ neigh += in_inc (bytes)
++        bhi         1b              @ loop while the count in a1 was above 1 before the decrement
++        pop         {a2-a4,v1-v8,pc}
++
++20:     teq         v8, #3
++        bne         10b             @ neigh pred_flag is not 3: strength 1 (flags are NE here)
++
++        teq         v1, v3
++        it          eq
++        teqeq       v2, v4
++        bne         40f             @ rpl entries do not match straight across: try the crossed pairing
++        teq         v1, v2
++        bne         30f             @ the two curr entries differ: only the straight mv comparison applies
++
++        ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      a3, v1, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ curr mv[0] vs neigh mv[0]
++        bne         25f
++        ssub16      ip, v4, v2
++        ssub16      a3, v2, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ curr mv[1] vs neigh mv[1]
++        beq         11b             @ straight pairing is within threshold: strength 0
++        @ drop through
++25:     ssub16      ip, v4, v1
++        ssub16      a3, v1, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ crossed: curr mv[0] vs neigh mv[1]
++        bne         10b
++        ssub16      ip, v3, v2
++        ssub16      a3, v2, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ crossed: curr mv[1] vs neigh mv[0]
++        b           10b
++
++30:     ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      a3, v1, v3
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ curr mv[0] vs neigh mv[0]
++        bne         10b
++        ssub16      ip, v4, v2
++        ssub16      a3, v2, v4
++        sel         a3, a3, ip
++        ands        a3, a3, lr      @ curr mv[1] vs neigh mv[1]
++        b           10b
++
++40:     teq         v1, v4
++        ite         eq
++        teqeq       v2, v3
++        bne         10b             @ no crossed match of rpl entries either: strength 1
++
++        ldrd        v1, v2, [v5]    @ curr->mv
++        ldrd        v3, v4, [v6]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        b           25b             @ entries match crossed: run the crossed mv comparison
++
++90:     mov         a3, #1
++        b           11b
++endfunc
++diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+ new file mode 100644
+-index 0000000..85a9102
++index 0000000..00eab9e
+ --- /dev/null
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -0,0 +1,212 @@
+-+unsigned char rpi_hevc_transform [] = {
+-+169,
+-+3,
+-+3,
+-+232,
+-+128,
+-+0,
+-+0,
+-+0,
+-+20,
+-+248,
+-+0,
+-+136,
+-+0,
+-+0,
+-+192,
+-+248,
+-+0,
+-+0,
+-+0,
+-+96,
+-+3,
+-+232,
+-+32,
+-+0,
+-+0,
+-+0,
+-+7,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+8,
+-+232,
+-+0,
+-+4,
+-+0,
+-+0,
+-+12,
+-+248,
+-+0,
+-+128,
+-+0,
+-+0,
+-+192,
+-+8,
+-+4,
+-+0,
+-+4,
+-+232,
+-+64,
+-+0,
+-+0,
+-+0,
+-+5,
+-+232,
+-+0,
+-+0,
+-+8,
+-+0,
+-+128,
+-+69,
+-+113,
+-+66,
+-+12,
+-+248,
+-+0,
+-+128,
+-+0,
+-+0,
+-+192,
+-+8,
+-+4,
+-+0,
+-+128,
+-+69,
+-+113,
+-+70,
+-+128,
+-+144,
+-+39,
+-+0,
+-+4,
+-+255,
+-+48,
+-+192,
+-+128,
+-+3,
+-+32,
+-+8,
+-+16,
+-+0,
+-+76,
+-+254,
+-+48,
+-+192,
+-+9,
+-+4,
+-+32,
+-+8,
+-+0,
+-+0,
+-+4,
+-+254,
+-+0,
+-+144,
+-+128,
+-+2,
+-+0,
+-+248,
+-+62,
+-+0,
+-+128,
+-+144,
+-+22,
+-+0,
+-+4,
+-+255,
+-+48,
+-+192,
+-+128,
+-+3,
+-+32,
+-+8,
+-+16,
+-+0,
+-+76,
+-+254,
+-+48,
+-+192,
+-+9,
+-+4,
+-+32,
+-+8,
+-+0,
+-+0,
+-+140,
+-+248,
+-+44,
+-+0,
+-+0,
+-+0,
+-+32,
+-+48,
+-+4,
+-+0,
+-+128,
+-+69,
+-+113,
+-+66,
+-+242,
+-+140,
+-+211,
+-+192,
+-+41,
+-+3,
+-+68,
+-+192,
+-+80,
+-+7,
+-+164,
+-+255,
+-+36,
+-+220,
+-+96,
+-+2,
+-+0,
+-+248,
+-+62,
+-+0,
+-+3,
+-+255,
+-+55,
+-+208,
+-+120,
+-+3,
+-+224,
+-+3,
+-+190,
+-+11,
+-+16,
+-+139,
+-+246,
+-+83,
+-+0,
+-+103,
+-+90,
+-+0,
+-+8,
+-+240,
+-+0,
+-+128,
+-+128,
+-+3,
+-+0,
+-+247,
+-+32,
+-+128,
+-+10,
+-+4,
+-+136,
+-+240,
+-+32,
+-+0,
+-+128,
+-+3,
+-+112,
+-+96,
+-+90,
+-+0,
+-+};
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-new file mode 100644
+-index 0000000..5e2728d
+---- /dev/null
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -0,0 +1,147 @@
+-+# ******************************************************************************
+-+# Argon Design Ltd.
+-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+-+#
+-+# Module : HEVC
+-+# Author : Peter de Rivaz
+-+# ******************************************************************************
+++++ b/libavcodec/arm/hevcdsp_epel_neon.S
++@@ -0,0 +1,337 @@
+++/*
+++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+++ *
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+ +
+-+# HEVC VPU Transform
+-+#
+-+# Transform matrix can be thought of as
+-+#   output row vector = input row vector * transMatrix2
+-+#
+-+# The even rows of the matrix are symmetric
+-+# The odd rows of the matrix are antisymmetric
+-+#
+-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+-+#
+-+# EXAMPLE
+-+#   (a b c d) (1 2  2  1)
+-+#             (3 4 -4 -3)
+-+#             (5 6  6  5)
+-+#             (7 8 -8 -7)
+-+#
+-+#  x=(a c)(1 2) = 1a+5c 2a+6c
+-+#         (5 6)
+-+#
+-+#  y=(b d)(3 4) = 3b+7d 4b+8d
+-+#         (7 8)
+-+#
+-+#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+-+#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+-+#
+-+#  Final results are (u , v[::-1])
+-+#
+-+#
+-+#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+-+#  Apply the even matrix first and stop before rounding
+-+#  Then apply the odd matrix in a full manner:
+-+#
+-+#   First step is to compute partial products with the first input (16 cycles)
+-+#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
+-+#   2a 4b 6c 8d
+-+#   2a -4b 6c -8d
+-+#   1a -3b 5c -7d
+-+#
+-+#   Second step is to sum partial products into final position (8 cycles)
+-+#   1a+3b+5c+7d
+-+#   2a+4b+6c+8d
+-+#   2a-4b+6c-8d
+-+#   1a-3b+5c-7d
+-+#
+-+#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+-+#
+-+#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+-+#
+-+#   For 8x8 we could compute two in parallel.
+-+#
+-+#
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++#define MAX_PB_SIZE #64
+++
++.macro vextin_d4
++    vld1.8    {q10}, [r1], r2    // q10 (d20:d21) = 16 bytes at r1, then r1 += r2
++    vmov      d16, d20    // d16 = bytes 0..7
++    vext.8    d17, d20, d21, #1    // d17 = bytes 1..8
++    vext.8    d18, d20, d21, #2    // d18 = bytes 2..9
++    vext.8    d19, d20, d21, #3    // d19 = bytes 3..10
++.endm
+++
++.macro vextin_d4_8
++    vld1.8    d16, [r1], r2    // d16 = 8 bytes at r1, then r1 += r2
++    vext.8    d17, d16, d16, #1    // bytes 1..7 then byte 0 (wraps inside d16, so the top lane is not a real neighbour)
++    vext.8    d18, d16, d16, #2    // bytes 2..7 then bytes 0..1
++    vext.8    d19, d16, d16, #3    // bytes 3..7 then bytes 0..2
++.endm
+++
++.macro load_coeffs_16b coeffs
++    ldr      \coeffs, [\coeffs]    // replace the pointer with the 32-bit word it points at (four tap bytes); the register is consumed
++    vdup.i8  d0, \coeffs    // d0 = lowest byte of the word in every lane
++    lsr      \coeffs, #8
++    vdup.i8  d1, \coeffs    // d1 = second byte
++    lsr      \coeffs, #8
++    vdup.i8  d2, \coeffs    // d2 = third byte
++    lsr      \coeffs, #8
++    vdup.i8  d3, \coeffs    // d3 = fourth byte
++.endm
+++
++.macro epel_filter_16b out=q12
++    vmull.u8 q3, d16, d0    // widening u8 products, one per tap
++    vmull.u8 q11, d19, d3
++    vmull.u8 \out, d17, d1
++    vmull.u8 q10, d18, d2
++    vadd.s16 q3, q11    // q3 = d16*d0 + d19*d3 (outer taps)
++    vadd.s16 \out, q10    // out = d17*d1 + d18*d2 (inner taps)
++    vsub.s16 \out, q3    // out = inner - outer; also overwrites q3, q10 and q11
++.endm
+++
++.macro load_coeffs_32b coeffs
++    ldr      \coeffs, [\coeffs]    // replace the pointer with the 32-bit word it points at (four tap bytes)
++    vmov.i64 d4, #0    // clear d4 so each tap byte ends up zero-extended in a 16-bit lane
++    vmov.8   d4[0], \coeffs    // 16-bit lane 0 = lowest byte of the word
++    lsr      \coeffs, #8
++    vmov.8   d4[2], \coeffs    // 16-bit lane 1 = second byte
++    lsr      \coeffs, #8
++    vmov.8   d4[4], \coeffs    // 16-bit lane 2 = third byte
++    lsr      \coeffs, #8
++    vmov.8   d4[6], \coeffs    // 16-bit lane 3 = fourth byte
++.endm
+++
++.macro epel_filter_32b
++    vmull.s16 q3, d24, d4[0] //q12 low half  * tap 0
++    vmull.s16 q4, d25, d4[0] //q12 high half * tap 0
++    vmull.s16 q5, d30, d4[3] //q15 low half  * tap 3
++    vmull.s16 q6, d31, d4[3] //q15 high half * tap 3
++
++    vmull.s16 q7, d26, d4[1] // q13 low half  * tap 1
++    vmull.s16 q8, d27, d4[1] // q13 high half * tap 1
++    vmull.s16 q9, d28, d4[2] // q14 low half  * tap 2
++    vmull.s16 q10, d29, d4[2] // q14 high half * tap 2
++    vadd.s32 q3, q5    // q3:q4 = outer taps (q12*tap0 + q15*tap3)
++    vadd.s32 q4, q6
++    vadd.s32 q7, q9    // q7:q8 = inner taps (q13*tap1 + q14*tap2)
++    vadd.s32 q8, q10
++    vsub.s32 q7, q3    // inner - outer, 32-bit lanes
++    vsub.s32 q8, q4
++    vqshrn.s32  d6, q7, #6    // saturating narrow of (sum >> 6) into q3 (d6 = low four results)
++    vqshrn.s32  d7, q8, #6    // d7 = high four results; q4-q7 (d8-d15) were written above
++.endm
+++
++.macro epel_filter_32b_4
++    vmull.s16 q3, d24, d4[0] //q12 low half * tap 0
++    vmull.s16 q5, d30, d4[3] //q15 low half * tap 3
++    vmull.s16 q7, d26, d4[1] // q13 low half * tap 1
++    vmull.s16 q9, d28, d4[2] // q14 low half * tap 2
++    vadd.s32 q3, q5    // outer taps
++    vadd.s32 q7, q9    // inner taps
++    vsub.s32 q7, q3    // inner - outer
++    vqshrn.s32  d6, q7, #6    // d6 = four results: saturating narrow of (sum >> 6)
++.endm
+++
++function ff_hevc_put_epel_h_neon_8, export=1
++        push   {r4-r7}    // 16 bytes pushed, so the first stacked argument is now at sp + 16
++        mov    r4, MAX_PB_SIZE
++        ldr    r7, [sp, #16] // mx
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2    // r7 = (mx - 1) * 4: byte offset of a 4-byte row in epel_coeffs
++        vpush {d8-d15}
++@ adr reaches if we are in thumb mode but not in arm
++T       adr    r12, epel_coeffs
++A       adrl   r12, epel_coeffs
++        add    r7, r12    // r7 -> selected coefficient row
++        sub       r1, #1    // start reading one byte before the address passed in r1
++        lsl       r4, #1    // r4 = 2 * MAX_PB_SIZE = 128: byte step between output rows
++        load_coeffs_16b r7
++        mov   r12, r3    // r12 = saved row count (r3 is the per-column loop counter)
++        mov   r6, r0    // r6 = output address of the current column
++        mov   r7, r1    // r7 = input address of the current column
++        cmp       r5, #6
++        bgt       8f    // width > 6: columns of 8
++        cmp       r5, #4
++        blt       2f    // width < 4: single column of 2
++        b         4f    // otherwise: column of 4
++8:      subs r3, #1    // 8-wide column, one row per iteration
++        pld [r1]
++        vextin_d4
++        epel_filter_16b
++        vst1.16    {q12}, [r0], r4    // store 8 x 16-bit results, step to the next output row
++        bne 8b
++        subs    r5, #8    // 8 columns done
++        beq  99f
++        mov       r3, r12    // restart the row counter
++        add       r6, #16    // next output column: 8 results * 2 bytes
++        mov       r0, r6
++        add       r7, #8    // next input column: 8 bytes
++        mov       r1, r7
++        cmp       r5, #4
++        bgt       8b    // more than 4 left: another 8-wide column, else fall into the 4-wide loop
++4:      subs r3, #1    // 4-wide column
++        pld [r1]
++        vextin_d4_8
++        epel_filter_16b
++        vst1.16    d24, [r0], r4    // store 4 x 16-bit results
++        bne 4b
++        subs      r5, #4
++        beq       99f
++        mov       r3, r12
++        add       r6, #8    // 4 results * 2 bytes
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++2:      subs r3, #1    // 2-wide column
++        pld [r1]
++        vextin_d4_8
++        epel_filter_16b
++        vst1.32    d24[0], [r0], r4    // store 2 x 16-bit results
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
+ +
+-+test_add:
+-+  vldh HX(0,0),(r0)
+-+  vadd HX(0,0),HX(0,0),10
+-+  vsth HX(0,0),(r0)
+-+  mov r0,7 # return value
+-+  b lr
++function ff_hevc_put_epel_v_neon_8, export=1
++        push   {r4-r7}    // 16 bytes pushed, so the first stacked argument is now at sp + 16
++        mov    r4, MAX_PB_SIZE
++        ldr    r7, [sp, #20] // my
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2    // r7 = (my - 1) * 4: byte offset of a 4-byte row in epel_coeffs
++        vpush {d8-d15}
++T       adr    r12, epel_coeffs
++A       adrl   r12, epel_coeffs
++        add    r7, r12    // r7 -> selected coefficient row
++        load_coeffs_16b r7
++        sub       r1, r2    // start one input row (r2 bytes) before the address passed in r1
++        lsl       r4, #1    // r4 = 2 * MAX_PB_SIZE = 128: byte step between output rows
++        mov   r12, r3    // r12 = saved row count (r3 is the per-column loop counter)
++        mov   r6, r0    // r6 = output address of the current column
++        mov   r7, r1    // r7 = input address of the current column
++0:      pld [r1]    // column start: preload the first three input rows into d16-d18
++        vld1.8    {d16}, [r1], r2
++        pld [r1]
++        vld1.8    {d17}, [r1], r2
++        pld [r1]
++        vld1.8    {d18}, [r1], r2
++        cmp       r5, #6
++        bgt       8f    // width > 6: column of 8
++        cmp       r5, #4
++        blt       2f    // width < 4: column of 2
++        b         4f    // otherwise: column of 4
++8:      pld [r1]
++        vld1.8    {d19}, [r1], r2    // fourth row of the 4-row window
++        subs r3, #1
++        epel_filter_16b
++        vst1.16    {q12}, [r0], r4    // store 8 x 16-bit results
++        vmov d16, d17    // slide the window down one row
++        vmov d17, d18
++        vmov d18, d19
++        bne 8b    // flags still come from the subs above
++        subs    r5, #8
++        beq  99f
++        mov       r3, r12    // restart the row counter
++        add       r6, #16    // next output column: 8 results * 2 bytes
++        mov       r0, r6
++        add       r7, #8    // next input column: 8 bytes
++        mov       r1, r7
++        b         0b
++4:      pld       [r1]
++        vld1.8    {d19}, [r1], r2
++        subs r3, #1
++        epel_filter_16b
++        vst1.16    d24, [r0], r4    // store 4 x 16-bit results
++        vmov d16, d17
++        vmov d17, d18
++        vmov d18, d19
++        bne 4b
++        subs      r5, #4
++        beq       99f
++        mov       r3, r12
++        add       r6, #8    // 4 results * 2 bytes
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++        b         0b
++2:      pld [r1]
++        vld1.8    {d19}, [r1], r2
++        subs r3, #1
++        epel_filter_16b
++        vst1.32    d24[0], [r0], r4    // store 2 x 16-bit results
++        vmov d16, d17
++        vmov d17, d18
++        vmov d18, d19
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
+ +
+-+# Columns are transformed first
+-+#
+-+# Store top left half of transMatrix2 in
+-+# Store bottom left half of transMatrix2 in HX(32,32)
+-+#
+-+# For 16x16
+-+# HX(0:15,0) contains input data before transform
+-+# HY(0:15,0) contains 32bit output data after transform
+-+# HX(32,0) contains even rows of left half of transMatrix2
+-+# HX(32,32) contains odd rows of left half of transMatrix2
+-+# HY(48,0) contains partial products ready for summing
+-+#
++function ff_hevc_put_epel_hv_neon_8, export=1
++        push   {r4-r7}    // 16 bytes pushed, so the first stacked argument is now at sp + 16
++        mov    r4, MAX_PB_SIZE
++        ldr    r6, [sp, #16] // mx
++        ldr    r7, [sp, #20] // my
++        ldr    r5, [sp, #24] // width
++        sub    r7, #1
++        lsl    r7, #2    // r7 = (my - 1) * 4
++        vpush {d8-d15}    // epel_filter_32b writes q4-q7, which are d8-d15
++        adr    r12, epel_coeffs    // NOTE(review): plain adr here, while the h and v variants use the T adr / A adrl pair - confirm it reaches in ARM mode
++        sub    r6, #1
++        lsl    r6, #2
++        add    r6, r12 // mx epel coeff offset
++        add    r7, r12    // my coefficient row
++        sub       r1, #1    // start one byte to the left ...
++        sub       r1, r2    // ... and one input row (r2 bytes) above
++        lsl       r4, #1    // r4 = 2 * MAX_PB_SIZE = 128: byte step between output rows
++        load_coeffs_16b r6
++        load_coeffs_32b r7
++        mov   r12, r3    // r12 = saved row count (r3 is the per-column loop counter)
++        mov   r6, r0    // r6 = output address of the current column
++        mov   r7, r1    // r7 = input address of the current column
++0:      pld   [r1]    // column start: horizontally filter three rows into q12, q13, q14 (16-byte loads whatever the width)
++        vextin_d4
++        epel_filter_16b q12
++        pld   [r1]
++        vextin_d4
++        epel_filter_16b q13
++        pld   [r1]
++        vextin_d4
++        epel_filter_16b q14
++        cmp       r5, #6
++        bgt       8f    // width > 6: column of 8
++        cmp       r5, #4
++        blt       2f    // width < 4: column of 2
++        b         4f    // otherwise: column of 4
++8:      pld     [r1]    // horizontally filter the fourth row into q15
++        vextin_d4
++        epel_filter_16b q15
++        subs r3, #1
++        epel_filter_32b
++        vst1.16    {q3}, [r0], r4    // store 8 x 16-bit results of the second (q12-q15) filter pass
++        vmov q12, q13    // slide the window of filtered rows down by one
++        vmov q13, q14
++        vmov q14, q15
++        bne 8b    // flags still come from the subs above
++        subs    r5, #8
++        beq  99f
++        mov       r3, r12    // restart the row counter
++        add       r6, #16    // next output column: 8 results * 2 bytes
++        mov       r0, r6
++        add       r7, #8    // next input column: 8 bytes
++        mov       r1, r7
++        b         0b
++4:      pld      [r1]
++        vextin_d4_8
++        epel_filter_16b q15
++        subs r3, #1
++        epel_filter_32b_4
++        vst1.16    d6, [r0], r4    // store 4 x 16-bit results
++        vmov q12, q13
++        vmov q13, q14
++        vmov q14, q15
++        bne 4b
++        subs      r5, #4
++        beq       99f
++        mov       r3, r12
++        add       r6, #8    // 4 results * 2 bytes
++        mov       r0, r6
++        add       r7, #4
++        mov       r1, r7
++        b         0b
++2:      pld      [r1]
++        vextin_d4_8
++        epel_filter_16b q15
++        subs r3, #1
++        epel_filter_32b_4
++        vst1.32    d6[0], [r0], r4    // store 2 x 16-bit results
++        vmov q12, q13
++        vmov q13, q14
++        vmov q14, q15
++        bne 2b
++99:     vpop {d8-d15}
++        pop {r4-r7}
++        bx lr
++endfunc
+ +
+++epel_coeffs:  // 4 bytes per row; ff_hevc_put_epel_hv_neon_8 addresses a row as epel_coeffs + (mx - 1) * 4 (same for my)
+++       .byte 2, 58, 10, 2  // row for mx / my == 1
+++       .byte 4, 54, 16, 2  // mx / my == 2
+++       .byte 6, 46, 28, 4  // mx / my == 3
+++       .byte 4, 36, 36, 4  // mx / my == 4
+++       .byte 4, 28, 46, 6  // mx / my == 5
+++       .byte 2, 16, 54, 4  // mx / my == 6
+++       .byte 2, 10, 58, 2  // mx / my == 7; NOTE(review): all entries are stored unsigned, any tap signs are presumably applied by the filter macros - confirm
++diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
++index 5591807..49c70dd 100644
++--- a/libavcodec/arm/hevcdsp_init_neon.c
+++++ b/libavcodec/arm/hevcdsp_init_neon.c
++@@ -22,6 +22,8 @@
++ #include "libavutil/arm/cpu.h"
++ #include "libavcodec/hevcdsp.h"
++ #include "hevcdsp_arm.h"
+++#include "libavcodec/avcodec.h"
+++#include "libavcodec/bit_depth_template.c"
++ 
++ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++ 
+++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++
+++void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++
+++void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++
++ #define PUT_PIXELS(name) \
++     void name(int16_t *dst, uint8_t *src, \
++                                 ptrdiff_t srcstride, int height, \
++@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
++ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
++ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
++ #undef PUT_PIXELS
+++void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
+++void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
+++void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
++ 
++ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, int width);
++@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
++     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
++ }
++ 
+++static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,  // SAO band filter: NEON kernels for selected sizes, C loop otherwise
+++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int8_t offset_table[32] = { 0 };  // one offset per band; a pixel's band is (pixel >> shift), see the fallback loop
+++    int k, y, x;
+++    int shift  = 3; // BIT_DEPTH - 5
+++    int cwidth = 0;  // stays 0 (-> default case below) unless the height check passes
+ +
+-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
+-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+-+# num: number of 16x16 transforms to be done
+-+#
+-+hevc_trans_16x16:
+-+  push r6-r15, lr # TODO cut down number of used registers
+++    stride_src /= sizeof(pixel);  // convert byte strides to pixel units for the pointer arithmetic in the fallback
+++    stride_dst /= sizeof(pixel);
+ +
+-+  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
+-+  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+  # Now use r0 to describe which matrix we are working on.
+-+  # Allows us to prefetch the next block of coefficients for efficiency.
+-+  mov r0,0 # This describes the location where we read our coefficients from
+-+  mov r3,16*2 # Stride of coefficients in bytes
+-+  mov r7,16*16*2 # Total block size
+-+  mov r8,64*16 # Value used to swap from current to next VRF location
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  mov r4,64 # Constant used for rounding first pass
+-+  mov r5,1<<19 # Constant used for rounding second pass
+++    for (k = 0; k < 4; k++)  // four consecutive bands starting at sao_left_class, wrapping modulo 32
+++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];  // offsets are taken from entries 1..4 and narrowed to int8_t
+ +
+-+  # At start of block r0,r1 point to the current block (that has already been loaded)
+-+block_loop:
+-+  eor r0,r8
+-+  add r1,r7
+-+  # Prefetch the next block
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  eor r0,r8
+-+  sub r1,r7
+++    if (height % 8 == 0)  // NEON kernels are only considered when height is a multiple of 8 (the w8 kernel consumes 8 rows per iteration)
+++        cwidth = width;
+ +
+-+  # Transform the current block
+-+  bl col_trans_16
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+-+  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
+++    switch(cwidth){  // note the kernel argument order: stride_src comes before stride_dst
+++    case 8:
+++        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 16:
+++        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 32:
+++        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 64:
+++        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    default:  // scalar fallback for every other width, and for any height that is not a multiple of 8
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width; x++)
+++                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+++            dst += stride_dst;
+++            src += stride_src;
+++        }
+++    }
+++}
+ +
+-+  bl col_trans_16
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+++#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))  // three-way compare: 1, 0 or -1
+++static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,  // SAO edge filter: NEON kernels for widths 32 / 64, C loop otherwise
+++                                          int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };  // reorders the caller's offsets so they can be indexed with (diff0 + diff1 + 2)
+++    static const int8_t pos[4][2][2] = {  // per eo: { dx, dy } of the two neighbours compared against each pixel
+++        { { -1,  0 }, {  1, 0 } }, // horizontal
+++        { {  0, -1 }, {  0, 1 } }, // vertical
+++        { { -1, -1 }, {  1, 1 } }, // 45 degree
+++        { {  1, -1 }, { -1, 1 } }, // 135 degree
+++    };
+++    int8_t sao_offset_val[8];  // 5 entries are filled below; 3 bytes of padding because the NEON code loads 8 bytes (they stay uninitialised)
+++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);  // source stride is a constant, not a parameter; NOTE(review): presumably matches the caller's buffer layout - confirm
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int a_stride, b_stride;
+++    int x, y;
+++    int cwidth = 0;  // stays 0 (-> default case below) unless the height check passes
+++
+++    for (x = 0; x < 5; x++) {
+++        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];  // narrowed from int16_t to int8_t
+++    }
+ +
+-+  # Save results - note there has been a transposition during the processing so we save columns
+-+  vsth VX(0,32++)+r0, (r1 += r3) REP 16
+++    if (height % 8 == 0)  // NEON kernels are only considered when height is a multiple of 8
+++        cwidth = width;
+ +
+-+  # Move onto next block
+-+  eor r0,r8
+-+  add r1,r7
+++    stride_src /= sizeof(pixel);  // convert byte strides to pixel units
+++    stride_dst /= sizeof(pixel);
+ +
+-+  addcmpbgt r2,-1,0,block_loop
+-+  pop r6-r15, pc
+++    switch (cwidth) {
+++    case 32:
+++        switch(eo) {  // no default: an eo outside 0..3 performs no filtering on this path
+++        case 0:
+++            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 1:
+++            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 2:
+++            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 3:
+++            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        }
+++        break;
+++    case 64:
+++        switch(eo) {  // no default: an eo outside 0..3 performs no filtering on this path
+++        case 0:
+++            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 1:
+++            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 2:
+++            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 3:
+++            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        }
+++        break;
+++    default:  // scalar fallback for every other width, and for any height that is not a multiple of 8
+++        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;  // element offset from a pixel to its first neighbour
+++        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;  // element offset from a pixel to its second neighbour
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width; x++) {
+++                int diff0         = CMP(src[x], src[x + a_stride]);
+++                int diff1         = CMP(src[x], src[x + b_stride]);
+++                int idx           = diff0 + diff1;  // range -2..2
+++                if (idx)  // dst[x] is only written when idx != 0; other pixels of dst are left as they were
+++                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+++            }
+++            src += stride_src;
+++            dst += stride_dst;
+++        }
+++    }
+++}
+++#undef CMP
+ +
+-+# r1,r2,r3 r7,r8 should be preserved
+-+# HX(0++,0)+r0 is the block to be transformed
+-+# HX(32++,0) is the 16x16 matrix of transform coefficients
+-+# Use HY(48,0) for intermediate results
+-+# r0 can be used, but should be returned to its original value at the end
+-+col_trans_16:
+-+  add r4,r0,16 # Final value for this loop
+-+col_trans_16_loop:
+-+  # First compute partial products for a single column
+-+  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
+-+  # Then sum up the results and place back
+-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-+  addcmpblt r0,1,r4,col_trans_16_loop
+-+  sub r0,16  # but r0 back to its original value
+-+  b lr
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-new file mode 100644
+-index 0000000..536896f
+---- /dev/null
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -0,0 +1,293 @@
+-+/*
+-+Copyright (c) 2012, Broadcom Europe Ltd.
+-+All rights reserved.
+++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                                MvField *curr, MvField *neigh, uint8_t *bs);
+ +
+-+Redistribution and use in source and binary forms, with or without
+-+modification, are permitted provided that the following conditions are met:
+-+    * Redistributions of source code must retain the above copyright
+-+      notice, this list of conditions and the following disclaimer.
+-+    * Redistributions in binary form must reproduce the above copyright
+-+      notice, this list of conditions and the following disclaimer in the
+-+      documentation and/or other materials provided with the distribution.
+-+    * Neither the name of the copyright holder nor the
+-+      names of its contributors may be used to endorse or promote products
+-+      derived from this software without specific prior written permission.
++ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++ {
++     if (bit_depth == 8) {
++@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
++         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
++         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+++        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+++          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
+++          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
+++        }
++         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
++         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
++         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
++@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
++             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
++             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+++            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
+++            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
+++            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
++         }
+++        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+++        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+++        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+++        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+++        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+++        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+++        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+++        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+++        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+++        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
+++
++         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
++         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
++         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
++@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
++         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
++     }
+ +
+-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-+*/
+++    assert(offsetof(MvField, mv) == 0);
+++    assert(offsetof(MvField, ref_idx) == 8);
+++    assert(offsetof(MvField, pred_flag) == 10);
+++    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
++ }
++diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
++new file mode 100644
++index 0000000..9c7808d
++--- /dev/null
+++++ b/libavcodec/arm/hevcdsp_sao_neon.S
++@@ -0,0 +1,510 @@
+++/*
+++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+++ *
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+ +
+-+#include <stdio.h>
+-+#include <string.h>
+-+#include <stdlib.h>
+-+#include <fcntl.h>
+-+#include <unistd.h>
+-+#include <assert.h>
+-+#include <stdint.h>
+-+#include <sys/mman.h>
+-+#include <sys/ioctl.h>
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++.macro init_sao_band  // shared prologue of the ff_hevc_sao_band_w*_neon_8 functions below
+++        pld      [r1]
+++        vld1.8   {q0, q1}, [r2]  // 32-byte offset table -> q0-q1 (r2 is reused for stride_dst next)
+++        ldr       r2, [sp, #0]   // stride_dst
+++        ldr      r12, [sp, #4]   // height
+++        vmov.u8  q3, #128  // bias constant consumed by sao_band_64
+++.endm
+++
+++// expects 128 in every byte of q3 and the 32-entry offset table in q0 - q1
+++// in/out: 64 pixels in q8 - q11; in: band indices (pixel >> 3) in q12 - q15, overwritten with the looked-up offsets
+++.macro sao_band_64
+++        vtbl.8   d24, {d0, d1, d2, d3}, d24  // replace each band index by its offset from the table
+++        vadd.s8  q8, q3  // pixel + 128 (mod 256): shifts 0..255 onto the signed byte range
+++        vtbl.8   d25, {d0, d1, d2, d3}, d25
+++        vadd.s8  q9, q3
+++        vtbl.8   d26, {d0, d1, d2, d3}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d0, d1, d2, d3}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0, d1, d2, d3}, d28
+++        vqadd.s8 q8, q12  // signed saturating add of the offset: together with the bias this clamps to 0..255
+++        vtbl.8   d29, {d0, d1, d2, d3}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d0, d1, d2, d3}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d0, d1, d2, d3}, d31
+++        vsub.s8  q8, q3  // remove the bias again
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++.endm
+++
+++function ff_hevc_sao_band_w8_neon_8, export=1  // SAO band filter, 8 pixels per row; C prototype: (dst, src, offset_table, stride_src, stride_dst, height)
+++        init_sao_band
+++1:      subs     r12, #8  // 8 rows per iteration, so height must be a positive multiple of 8; nothing later in the loop sets flags, bne 1b tests this
+++        vld1.8   {d16}, [r1, :64], r3  // each load takes one 8-byte row (64-bit aligned) and steps src by r3 (stride_src)
+++        vld1.8   {d17}, [r1, :64], r3
+++        vshr.u8  q12, q8, #3  // band index = pixel >> 3
+++        vld1.8   {d18}, [r1, :64], r3
+++        vld1.8   {d19}, [r1, :64], r3
+++        vshr.u8  q13, q9, #3
+++        vld1.8   {d20}, [r1, :64], r3
+++        vld1.8   {d21}, [r1, :64], r3
+++        vshr.u8  q14, q10, #3
+++        vld1.8   {d22}, [r1, :64], r3
+++        vld1.8   {d23}, [r1, :64], r3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8  {d16}, [r0, :64], r2  // write the 8 filtered rows, stepping dst by r2 (stride_dst)
+++        vst1.8  {d17}, [r0, :64], r2
+++        vst1.8  {d18}, [r0, :64], r2
+++        vst1.8  {d19}, [r0, :64], r2
+++        vst1.8  {d20}, [r0, :64], r2
+++        vst1.8  {d21}, [r0, :64], r2
+++        vst1.8  {d22}, [r0, :64], r2
+++        vst1.8  {d23}, [r0, :64], r2
+++        bne    1b
+++
+++        bx lr
+++endfunc
+ +
+-+#include <linux/ioctl.h>
+++function ff_hevc_sao_band_w16_neon_8, export=1  // SAO band filter, 16 pixels per row; C prototype: (dst, src, offset_table, stride_src, stride_dst, height)
+++        init_sao_band
+++1:      subs     r12, #4  // 4 rows per iteration, so height must be a positive multiple of 4; tested by bne 1b
+++        vld1.8  {q8}, [r1, :128], r3  // one 16-byte row (128-bit aligned), then src += r3 (stride_src)
+++        vshr.u8  q12, q8, #3  // band index = pixel >> 3
+++        vld1.8  {q9}, [r1, :128], r3
+++        vshr.u8  q13, q9, #3
+++        vld1.8  {q10}, [r1, :128], r3
+++        vshr.u8  q14, q10, #3
+++        vld1.8  {q11}, [r1, :128], r3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8   {q8}, [r0, :128], r2  // write the 4 filtered rows, stepping dst by r2 (stride_dst)
+++        vst1.8   {q9}, [r0, :128], r2
+++        vst1.8   {q10}, [r0, :128], r2
+++        vst1.8   {q11}, [r0, :128], r2
+++        bne    1b
+++
+++        bx lr
+++endfunc
+ +
+-+#define MAJOR_NUM 100
+-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+-+#define DEVICE_FILE_NAME "/dev/char_dev"
+++function ff_hevc_sao_band_w32_neon_8, export=1  // SAO band filter, 32 pixels per row; C prototype: (dst, src, offset_table, stride_src, stride_dst, height)
+++        init_sao_band
+++1:      subs     r12, #2  // 2 rows per iteration, so height must be a positive multiple of 2; tested by bne 1b
+++        vld1.8   {q8-q9}, [r1, :128], r3  // one 32-byte row, then src += r3 (stride_src)
+++        vshr.u8  q12, q8, #3  // band index = pixel >> 3
+++        vshr.u8  q13, q9, #3
+++        vld1.8   {q10-q11}, [r1, :128], r3
+++        vshr.u8  q14, q10, #3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8   {q8-q9}, [r0, :128], r2  // write the 2 filtered rows, stepping dst by r2 (stride_dst)
+++        vst1.8   {q10-q11}, [r0, :128], r2
+++        bne      1b
+++
+++        bx       lr
+++endfunc
+ +
+-+#include "rpi_mailbox.h"
+++function ff_hevc_sao_band_w64_neon_8, export=1  // SAO band filter, 64 pixels per row; C prototype: (dst, src, offset_table, stride_src, stride_dst, height)
+++        init_sao_band
+++1:      subs      r12, #1  // one row per iteration; tested by bne 1b (the plain sub instructions below do not set flags)
+++        pld       [r1, r3]  // prefetch the next source row
+++        vld1.8    {q8-q9}, [r1, :128]!  // first 32 bytes of the row, r1 += 32
+++        vshr.u8  q12, q8, #3  // band index = pixel >> 3
+++        vshr.u8  q13, q9, #3
+++        vld1.8    {q10-q11}, [r1, :128], r3  // second 32 bytes, r1 += r3 (stride_src)
+++        vshr.u8  q14, q10, #3
+++        vshr.u8  q15, q11, #3
+++        sub       r1, #32  // cancel the +32 writeback: net effect is src += stride_src
+++        sao_band_64
+++        vst1.8    {q8-q9}, [r0, :128]!  // first 32 output bytes, r0 += 32
+++        vst1.8    {q10-q11}, [r0, :128], r2  // second 32 output bytes, r0 += r2 (stride_dst)
+++        sub       r0, #32  // cancel the +32 writeback: net effect is dst += stride_dst
+++        bne       1b
+++
+++        bx lr
+++endfunc
+ +
+-+#define PAGE_SIZE (4*1024)
+++.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3  // per unsigned byte: out0 = sign(in2 - in0), out1 = sign(in3 - in1) as -1 / 0 / +1; tmp0 / tmp1 are scratch
+++        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
+++        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
+++        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
+++        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
+++        vsub.s8 \out0, \tmp0, \out0 // diff0 = (a > c) - (c > a) in mask terms, i.e. +1 when c > a, -1 when a > c
+++        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+++.endm
+++
+++.macro table64  // in: diff0 + diff1 (-2..2) in q0-q3, pixels c in q4-q7, table bytes in r4 / r5; adds the looked-up offsets to c and stores 64 bytes at r0
+++        vmov.s8 q13, #2 // 2 to all elements
+++        vmov.32  d24[0], r4  // load offset table from general registers
+++        vmov.32  d24[1], r5  // load rest of offset table
+++
+++        vadd.s8 q0, q13  // bias the sums from -2..2 to table indices 0..4
+++        vadd.s8 q1, q13
+++        vadd.s8 q2, q13
+++        vadd.s8 q3, q13
+++
+++        vmov.u8  q15, #128 // s8 #-128
+++        vtbl.8   d0, {d24}, d0  // index -> offset, using the 8-byte table in d24
+++        vadd.s8  q13,  q4, q15  // pixel + 128 (mod 256): shifts 0..255 onto the signed byte range
+++        vtbl.8   d1, {d24}, d1
+++        vadd.s8  q14,  q5, q15
+++        vtbl.8   d2, {d24}, d2
+++        vqadd.s8 q0, q13  // signed saturating add: together with the bias this clamps the result to 0..255
+++        vtbl.8   d3, {d24}, d3
+++        vqadd.s8 q1, q14
+++        vtbl.8   d4, {d24}, d4
+++        vadd.s8  q13,  q6, q15
+++        vtbl.8   d5, {d24}, d5
+++        vadd.s8  q14,  q7, q15
+++        vtbl.8   d6, {d24}, d6
+++        vqadd.s8 q2, q13
+++        vtbl.8   d7, {d24}, d7
+++        vqadd.s8 q3, q14
+++        vsub.s8   q0, q15  // remove the bias again
+++        vsub.s8   q1, q15
+++        vsub.s8   q2, q15
+++        vsub.s8   q3, q15
+++        vst1.8  {q0-q1}, [r0, :128]!  // first 32 output bytes, r0 += 32
+++        vst1.8  {q2-q3}, [r0, :128], r2  // second 32 output bytes, r0 += r2
+++        sub     r0, #32  // cancel the +32 writeback: net effect is r0 += r2
+++.endm
+++
+++// input
+++// a in q0 - q3
+++// c in q4 - q7
+++// b in q8 - q11
+++// offset table in r4 and r5
+++// output in q0 - q3 (also stored to [r0] by table64, which advances r0 by r2)
+++// clobbers q12 - q15
+++.macro edge_w64_body
+++        diff32 q12, q13, q0, q1, q0, q1, q4, q5  // q12 / q13 = sign(c - a) for the first 32 pixels; a (q0 / q1) doubles as scratch and is destroyed
+++        diff32 q0, q1, q14, q15, q8, q9, q4, q5  // q0 / q1 = sign(c - b) for the first 32 pixels
+++
+++        vadd.s8  q0, q12 //diff0 + diff1
+++        vadd.s8  q1, q13
+++
+++        diff32  q14, q15, q2, q3, q2, q3, q6, q7  // same two steps for the second 32 pixels
+++        diff32  q2, q3, q12, q13, q10, q11, q6, q7
+++
+++        vadd.s8  q2, q14
+++        vadd.s8  q3, q15
+++        table64
+++.endm
+++
+++.macro init_edge_64  // shared prologue of the ff_hevc_sao_edge_eo*_w64_neon_8 functions; the matching pop {r4-r5} is in each function
+++        push   {r4-r5}  // 8 bytes pushed, so the stack arguments now start at [sp, #8]
+++        ldr    r12, [sp, #8] // height
+++        ldr    r5, [sp, #12] // sao_offset_val_table
+++        ldr    r4, [r5]  // r4 = table bytes 0..3
+++        add    r5, #4
+++        ldr    r5, [r5]  // r5 = table bytes 4..7 (the pointer itself is no longer needed)
+++.endm
+++
+++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1  // SAO edge filter, left / right neighbours, 64 pixels per row; C prototype: (dst, src, stride_dst, stride_src, height, sao_offset_table)
+++        init_edge_64
+++        vpush {d8-d15}
+++        sub    r1, #8  // start 8 bytes before the row so the byte left of pixel 0 comes from a 64-bit aligned load
+++1:      subs    r12, #1  // one row per iteration; tested by the bne at the bottom (nothing in between sets flags)
+++        vld1.64  {d7}, [r1, :64]!  // 8 bytes before the row; only the top byte of q3 is used below
+++        vld1.64  {q4-q5}, [r1, :128]! // load c
+++        vld1.64  {q6-q7}, [r1, :128]!
+++        vld1.64  {d24}, [r1, :64], r3  // 8 bytes after the row; only the bottom byte of q12 is used below
+++        sub      r1, #72  // cancel the 8 + 32 + 32 bytes of writeback: net effect is r1 += r3 (stride_src)
+++        // load a
+++        vext.8 q0, q3, q4, #15  // a = the row shifted by one pixel so that a[x] = c[x - 1]
+++        vext.8 q1, q4, q5, #15
+++        vext.8 q2, q5, q6, #15
+++        vext.8 q3, q6, q7, #15
+++        // load b
+++        vext.8 q8, q4, q5, #1  // b = the row shifted by one pixel so that b[x] = c[x + 1]
+++        vext.8 q9, q5, q6, #1
+++        vext.8 q10, q6, q7, #1
+++        vext.8 q11, q7, q12, #1
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+// Shared memory will not be cached in ARM cache
+-+void *mapmem_shared(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_SHARED/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1  // SAO edge filter, above / below neighbours, 64 pixels per row; C prototype: (dst, src, stride_dst, stride_src, height, sao_offset_table)
+++        init_edge_64
+++        vpush {d8-d15}
+++        sub     r1, r3  // step back one row (r3 = stride_src) to the row above the first output row
+++        // load a
+++        vld1.8  {q0-q1}, [r1, :128]!
+++        vld1.8  {q2-q3}, [r1, :128], r3
+++        sub     r1, #32  // cancel the +32 writeback: r1 now points at the first output row
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32  // r1 now points at the row below
+++1:      subs    r12, #1  // one row per iteration; tested by the bne at the bottom (nothing in between sets flags)
+++        // load b
+++        vld1.8  {q8-q9}, [r1, :128]!
+++        vld1.8  {q10-q11}, [r1, :128], r3
+++        sub     r1, #32  // net effect of the three instructions: r1 += r3
+++        edge_w64_body
+++        // copy c to a
+++        vmov.64 q0, q4  // rows are carried over in registers, so each source row is loaded only once
+++        vmov.64 q1, q5
+++        vmov.64 q2, q6
+++        vmov.64 q3, q7
+++        // copy b to c
+++        vmov.64 q4, q8
+++        vmov.64 q5, q9
+++        vmov.64 q6, q10
+++        vmov.64 q7, q11
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
+-+void *mapmem_private(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_PRIVATE/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+++function ff_hevc_sao_edge_eo2_w64_neon_8, export=1  // SAO edge filter, up-left / down-right neighbours, 64 pixels per row; C prototype: (dst, src, stride_dst, stride_src, height, sao_offset_table)
+++        init_edge_64
+++        vpush {d8-d15}
+++1:      sub     r1, r3  // loop entry: r1 points at the current row; step to the row above (r3 = stride_src)
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        sub     r1, #1  // a starts one byte to the left: a[x] = pixel at (x - 1, y - 1)
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #31  // cancel -1 and the +32 writeback: r1 is back at the current row
+++        subs    r12, #1  // one row per iteration; tested by the bne at the bottom (nothing in between sets flags)
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32  // r1 now points at the row below
+++        // load b
+++        add     r1, #1  // b starts one byte to the right: b[x] = pixel at (x + 1, y + 1)
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #33  // cancel +1 and the +32 writeback: r1 stays on the row below, which is the next current row
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+void unmapmem(void *addr, unsigned size)
+-+{
+-+   int s = munmap(addr, size);
+-+   if (s != 0) {
+-+      printf("munmap error %d\n", s);
+-+      exit (-1);
+-+   }
+-+}
+++function ff_hevc_sao_edge_eo3_w64_neon_8, export=1  // SAO edge filter, up-right / down-left neighbours, 64 pixels per row; C prototype: (dst, src, stride_dst, stride_src, height, sao_offset_table)
+++        init_edge_64
+++        vpush {d8-d15}
+++1:      sub     r1, r3  // loop entry: r1 points at the current row; step to the row above (r3 = stride_src)
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        add     r1, #1  // a starts one byte to the right: a[x] = pixel at (x + 1, y - 1)
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #33  // cancel +1 and the +32 writeback: r1 is back at the current row
+++        subs    r12, #1  // one row per iteration; tested by the bne at the bottom (nothing in between sets flags)
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32  // r1 now points at the row below
+++        // load b
+++        sub     r1, #1  // b starts one byte to the left: b[x] = pixel at (x - 1, y + 1)
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #31  // cancel -1 and the +32 writeback: r1 stays on the row below, which is the next current row
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+/*
+-+ * use ioctl to send mbox property message
+-+ */
+++.macro init_edge_32  // shared prologue of the w32 edge functions; must run before anything is pushed, since it reads the stack arguments at [sp] and [sp, #4]
+++        ldr     r12, [sp, #4] // sao_offset_val_table
+++        vld1.32 {d31}, [r12]  // 8 table bytes -> d31, where table32 expects them
+++        ldr     r12, [sp] // height
+++.endm
+++
+++.macro diff out0, tmp0, in0, in1
+++        vcgt.u8 \out0, \in1, \in0  // out0 = (in1 > in0) ? -1 : 0, unsigned per-byte compare ("c > a" at the call sites)
+++        vcgt.u8 \tmp0,  \in0, \in1  // tmp0 = (in0 > in1) ? -1 : 0 ("a > c"); tmp0 is scratch
+++        vsub.s8 \out0, \tmp0, \out0 // out0 = tmp0 - out0 = sign(in1 - in0) per byte: +1, 0 or -1
+++.endm
+++
+++.macro table32
+++        vmov.s8  q10, #2
+++        vadd.s8  q0, q10
+++        vadd.s8  q1, q10
+++        vmov.s8  q10, #128
+++        vtbl.8   d0, {d31}, d0
+++        vadd.s8  q11, q2, q10
+++        vtbl.8   d1, {d31}, d1
+++        vadd.s8  q12, q3, q10
+++        vtbl.8   d2, {d31}, d2
+++        vqadd.s8 q11, q0
+++        vtbl.8   d3, {d31}, d3
+++        vqadd.s8 q12, q1
+++        vsub.s8  q0, q11, q10
+++        vsub.s8  q1, q12, q10
+++        vst1.8   {q0-q1}, [r0, :128], r2
+++.endm
+++
+++function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
+++        init_edge_32
+++        vpush {q4-q7}
+++        sub     r1, #4
+++1:      subs    r12, #1
+++        vld1.8  {q13-q14}, [r1]!
+++        vld1.32 d30, [r1], r3
+++        sub     r1, #32
+++        // a
+++        vext.8   q0, q13, q14, #3
+++        vext.8   q1, q14, q15, #3
+++        vshr.u64 d24, d30, #24
+++        // c
+++        vext.8   q2, q13, q14, #4
+++        vext.8   q3, q14, q15, #4
+++        vshr.u64 d16, d30, #32
+++        // diff0
+++        diff32 q13, q14, q4, q5, q0, q1, q2, q3
+++        diff   d18, d25, d24, d16
+++        // -diff1
+++        vext.s8 q0, q13, q14, #1
+++        vext.s8 q1, q14, q9, #1
+++
+++        vsub.s8 q0, q13, q0 //diff0 + diff1
+++        vsub.s8 q1, q14, q1
+++        table32
+++        bne     1b
+++        vpop {q4-q7}
+++
+++        bx      lr
+++endfunc
+ +
+-+static int mbox_property(int file_desc, void *buf)
+-+{
+-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+++function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
+++        init_edge_32
+++        vpush {q4-q7}
+++        // load a
+++        sub     r1, r3
+++        vld1.8  {q0-q1}, [r1, :128], r3
+++        // load c
+++        vld1.8  {q2-q3}, [r1, :128], r3
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q8-q9}, [r1, :128], r3
+++        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
+++        vadd.s8 q0, q4, q12 //diff0 + diff1
+++        vadd.s8 q1, q5, q13
+++        table32
+++        // CMP ( c, a )
+++        vneg.s8 q12, q4
+++        vneg.s8 q13, q5
+++        // c
+++        vmov.64 q2, q8
+++        vmov.64 q3, q9
+++        bne     1b
+++        vpop {q4-q7}
+++        bx      lr
+++endfunc
+ +
+-+   if (ret_val < 0) {
+-+      printf("ioctl_set_msg failed:%d\n", ret_val);
+-+   }
+++function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
+++        init_edge_32
+++        vpush   {d8-d15}
+++        // load a
+++        sub     r1, r3
+++        sub     r1, #8
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {d24}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q0, q10, q11, #7
+++        vext.8  q1, q11, q12, #7
+++        // load c
+++        vld1.8  {d9}, [r1, :64]!
+++        vld1.8  {q2-q3}, [r1, :64], r3
+++        sub     r1, #8
+++        vext.8  q4, q4, q2, #15
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {q12}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q8, q10, q11, #9
+++        vext.8  q9, q11, q12, #9
+++        vext.8  q6, q10, q11, #8
+++        vext.8  q7, q11, q12, #8
+++        vext.8  q5, q10, q11, #7
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        vadd.s8 q0, q12 //diff0 + diff1
+++        vadd.s8 q1, q13
+++        table32
+++        // inputs for next loop iteration
+++        // a
+++        vmov.8  q0, q4
+++        vext.8  q1, q2, q3, #15
+++        // c
+++        vmov.8  q2, q6
+++        vmov.8  q3, q7
+++        vmov.8  q4, q5
+++        bne     1b
+++        vpop    {d8-d15}
+++        bx      lr
+++endfunc
+ +
+-+#ifdef DEBUG
+-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+-+   for (i=0; i<size/4; i++)
+-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+-+#endif
+-+   return ret_val;
+-+}
+++function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
+++        init_edge_32
+++        sub     r1, r3
+++        // load a
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {d24}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q0, q10, q11, #1
+++        vext.8  q1, q11, q12, #1
+++        // load c
+++        vld1.8  {q2-q3}, [r1, :64]!
+++        vld1.8  {d30}, [r1, :64], r3
+++        sub     r1, #40
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {q12}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q8, q10, q11, #7
+++        vext.8  q9, q11, q12, #7
+++        vext.8  q14, q12, q10, #7
+++
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++
+++        vadd.s8 q0, q12 //diff0 + diff1
+++        vadd.s8 q1, q13
+++        table32
+++
+++        // inputs for next loop iteration
+++        // a
+++        vext.8  q0, q2, q3, #1
+++        vext.8  q1, q3, q15, #1
+++        // c
+++        vext.8  q2, q8, q9, #1
+++        vext.8  q3, q9, q14, #1
+++        vext.8  d30, d28, d2, #1
+++        bne     1b
+++        bx      lr
+++endfunc
+ +
+-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
++diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
++index 39713ed..25eb52b 100644
++--- a/libavcodec/avcodec.h
+++++ b/libavcodec/avcodec.h
++@@ -410,6 +410,8 @@ enum AVCodecID {
++     AV_CODEC_ID_SHEERVIDEO,
++     AV_CODEC_ID_YLC,
++ 
+++    AV_CODEC_ID_H264_MVC,
+++
++     /* various PCM "codecs" */
++     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
++     AV_CODEC_ID_PCM_S16LE = 0x10000,
++@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
++ #define FF_BUG_DC_CLIP          4096
++ #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
++ #define FF_BUG_TRUNCATED       16384
+++#define FF_BUG_GMC_UNSUPPORTED 32768
++ 
++     /**
++      * strictly follow the standard (MPEG-4, ...).
++@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
++ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
++ #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
++ #define FF_PROFILE_H264_CAVLC_444            44
+++#define FF_PROFILE_H264_MULTIVIEW_HIGH       118
+++#define FF_PROFILE_H264_STEREO_HIGH          128
+++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
++ 
++ #define FF_PROFILE_VC1_SIMPLE   0
++ #define FF_PROFILE_VC1_MAIN     1
++@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
++ #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
++ #endif
++ 
+++    /**
+++     * Opaque pointer for use by replacement get_buffer2 code
+++     *
+++     * @author jc (08/02/2016)
+++     */
+++    void * get_buffer_context;
++ } AVCodecContext;
++ 
++ AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
++diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
++index 1bf1c62..ccfa991 100644
++--- a/libavcodec/cabac.h
+++++ b/libavcodec/cabac.h
++@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
++ typedef struct CABACContext{
++     int low;
++     int range;
++-    int outstanding_count;
+++    union
+++    {
+++        int outstanding_count;
+++        struct {
+++            uint16_t bits;
+++            uint16_t range;
+++        } by22;
+++    };
++     const uint8_t *bytestream_start;
++     const uint8_t *bytestream;
++     const uint8_t *bytestream_end;
++diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
++index 9d94b72..535ebf0 100644
++--- a/libavcodec/codec_desc.c
+++++ b/libavcodec/codec_desc.c
++@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
++         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
++         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
++     },
+++    {
+++        .id        = AV_CODEC_ID_H264_MVC,
+++        .type      = AVMEDIA_TYPE_VIDEO,
+++        .name      = "h264_mvc",
+++        .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+++        .props     = AV_CODEC_PROP_LOSSY,
+++    },
++ 
++     /* various PCM "codecs" */
++     {
++diff --git a/libavcodec/h264.h b/libavcodec/h264.h
++index efe3555..16358aa 100644
++--- a/libavcodec/h264.h
+++++ b/libavcodec/h264.h
++@@ -126,7 +126,9 @@ enum {
++     NAL_END_STREAM      = 11,
++     NAL_FILLER_DATA     = 12,
++     NAL_SPS_EXT         = 13,
+++    NAL_SPS_SUBSET      = 15,
++     NAL_AUXILIARY_SLICE = 19,
+++    NAL_SLICE_EXT       = 20,
++     NAL_FF_IGNORE       = 0xff0f001,
++ };
++ 
++diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
++index ce4bab2..b9b0c78 100644
++--- a/libavcodec/h264_parser.c
+++++ b/libavcodec/h264_parser.c
++@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
++-    uint8_t parse_history[6];
+++    uint8_t parse_history[9]; /* h264_find_frame_end now collects 9 bytes (parse_history_count > 8) before parsing */
++     int parse_history_count;
++     int parse_last_mb;
+++    int is_mvc;
+++    int slice_ext;
++ } H264ParseContext;
++ 
++ 
++@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
++         } else if (state <= 5) {
++             int nalu_type = buf[i] & 0x1F;
++             if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
++-                nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+++                nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+++                nalu_type == NAL_SPS_SUBSET) {
++                 if (pc->frame_start_found) {
++                     i++;
++                     goto found;
++                 }
++             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
++-                       nalu_type == NAL_IDR_SLICE) {
+++                       nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
++                 state += 8;
+ +
+-+   p[i++] = 0x3000c; // (the tag id)
+-+   p[i++] = 12; // (size of the buffer)
+-+   p[i++] = 12; // (size of the data)
+-+   p[i++] = size; // (num bytes? or pages?)
+-+   p[i++] = align; // (alignment)
+-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+++                p->slice_ext = (nalu_type == NAL_SLICE_EXT);
++                 continue;
++             }
++             state = 7;
++         } else {
++             p->parse_history[p->parse_history_count++] = buf[i];
++-            if (p->parse_history_count > 5) {
+++            if (p->parse_history_count > 8) {
++                 unsigned int mb, last_mb = p->parse_last_mb;
++                 GetBitContext gb;
++ 
++-                init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+++                init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
++                 p->parse_history_count = 0;
++                 mb= get_ue_golomb_long(&gb);
++                 p->parse_last_mb = mb;
++@@ -145,7 +150,7 @@ found:
++     pc->frame_start_found = 0;
++     if (p->is_avc)
++         return next_avc;
++-    return i - (state & 5) - 5 * (state > 7);
+++    return i - (state & 5) - 8 * (state > 7);
++ }
++ 
++ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
++@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
++         }
++     }
++ 
++-    parse_nal_units(s, avctx, buf, buf_size);
+++    if (!p->is_mvc)
+++        parse_nal_units(s, avctx, buf, buf_size);
++ 
++     if (avctx->framerate.num)
++         avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
++@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
++         if ((state & 0xFFFFFF00) != 0x100)
++             break;
++         nalu_type = state & 0x1F;
++-        if (nalu_type == NAL_SPS) {
+++        if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
++             has_sps = 1;
++         } else if (nalu_type == NAL_PPS)
++             has_pps = 1;
++@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
++     .parser_close   = h264_close,
++     .split          = h264_split,
++ };
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++static av_cold int init_mvc(AVCodecParserContext *s)
+++{
+++    H264ParseContext *p = s->priv_data;
+++    int ret = init(s);
+++    if (ret < 0)
+++        return ret;
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++    p->is_mvc = 1;
+++    return 0;
+ +}
+ +
+-+unsigned mem_free(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++AVCodecParser ff_h264_mvc_parser = {
+++    .codec_ids      = { AV_CODEC_ID_H264_MVC },
+++    .priv_data_size = sizeof(H264ParseContext),
+++    .parser_init    = init_mvc,
+++    .parser_parse   = h264_parse,
+++    .parser_close   = h264_close,
+++    .split          = h264_split,
+++};
++diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
++index b478065..88dd40b 100644
++--- a/libavcodec/hevc.c
+++++ b/libavcodec/hevc.c
++@@ -41,8 +41,186 @@
++ #include "hevc.h"
++ #include "profiles.h"
++ 
+++#ifdef RPI
+++  #include "rpi_qpu.h"
+++  #include "rpi_user_vcsm.h"
+++  // Move Inter prediction into separate pass
+++  #define RPI_INTER
+ +
+-+   p[i++] = 0x3000f; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++  #ifdef RPI_INTER_QPU
+++    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+++    #define RPI_MULTI_MAILBOX
+++  #endif
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+++  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+++  //#define RPI_SIMULATE_QPUS
+++  #ifdef RPI_WORKER
+++    #include "pthread.h"
+++  #endif
+ +
+-+unsigned mem_lock(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++  static void rpi_execute_dblk_cmds(HEVCContext *s);
+++  static void rpi_execute_transform(HEVCContext *s);
+++  static void rpi_launch_vpu_qpu(HEVCContext *s);
+++  static void rpi_execute_pred_cmds(HEVCContext *s);
+++  static void rpi_execute_inter_cmds(HEVCContext *s);
+++  static void rpi_begin(HEVCContext *s);
+++  static void flush_frame(HEVCContext *s,AVFrame *frame);
+++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+ +
+-+   p[i++] = 0x3000d; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++#endif
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// #define DISABLE_MC
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+ +
+-+unsigned mem_unlock(int file_desc, unsigned handle)
+++#ifndef av_mod_uintp2
+++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+ +{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x3000e; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++    return a & ((1U << p) - 1); // keep the low p bits of a; unsigned shift because 1 << 31 overflows a signed int
+ +}
+++#   define av_mod_uintp2   av_mod_uintp2_c
+++#endif
+ +
+-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
++ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++ 
+ +
+-+   p[i++] = 0x30010; // (the tag id)
+-+   p[i++] = 28; // (size of the buffer)
+-+   p[i++] = 28; // (size of the data)
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+++#ifdef RPI_INTER_QPU
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+++// For each block of 64*64 the smallest block size is 8x4
+++// We also need an extra command for the setup information
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define RPI_CHROMA_COMMAND_WORDS 12
+++#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+++// The QPU code for UV blocks only works up to a block width of 8
+++#define RPI_CHROMA_BLOCK_WIDTH 8
+ +
+-+unsigned qpu_enable(int file_desc, unsigned enable)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+++#define RPI_LUMA_COMMAND_WORDS 10
+++#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+ +
+-+   p[i++] = 0x30012; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = enable;
+++// TODO Chroma only needs 4 taps
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// Actual filter goes -ve, +ve, +ve, -ve using these values
+++static const uint32_t rpi_filter_coefs[8][1] = {
+++        { ENCODE_COEFFS(   0,  64,   0,   0) },
+++        { ENCODE_COEFFS(  2,  58,  10,  2) },
+++        { ENCODE_COEFFS(  4,  54,  16,  2) },
+++        { ENCODE_COEFFS(  6,  46,  28,  4) },
+++        { ENCODE_COEFFS(  4,  36,  36,  4) },
+++        { ENCODE_COEFFS(  4,  28,  46,  6) },
+++        { ENCODE_COEFFS(  2,  16,  54,  4) },
+++        { ENCODE_COEFFS(  2,  10,  58,  2) }
+++};
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#endif
+ +
+-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+-+   int i=0;
+-+   unsigned p[32];
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30011; // (the tag id)
+-+   p[i++] = 16; // (size of the buffer)
+-+   p[i++] = 16; // (size of the data)
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+++#ifdef RPI_WORKER
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define LOG_ENTER
+++#define LOG_EXIT
+ +
+-+int mbox_open() {
+-+   int file_desc;
+++// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+++static void worker_submit_job(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  s->worker_tail++;
+++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+++}
+ +
+-+   // open a char device file used for communicating with kernel mbox driver
+-+   file_desc = open(DEVICE_FILE_NAME, 0);
+-+   if (file_desc < 0) {
+-+      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+-+      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+-+   }
+-+   return file_desc;
+++// Call this to say we have completed pass1
+++static void worker_complete_job(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  s->worker_head++;
+++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+ +}
+ +
+-+void mbox_close(int file_desc) {
+-+  close(file_desc);
+++// Call this to wait for all jobs to have completed at the end of a frame
+++static void worker_wait(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  while( s->worker_head !=s->worker_tail)
+++  {
+++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+++  }
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+ +}
+-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+-new file mode 100644
+-index 0000000..c264d2e
+---- /dev/null
+-+++ b/libavcodec/rpi_mailbox.h
+-@@ -0,0 +1,20 @@
+-+#ifndef RPI_MAILBOX_H
+-+#define RPI_MAILBOX_H
+ +
+-+extern int mbox_open(void);
+-+extern void mbox_close(int file_desc);
+++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+++// available to receive the next job.
+++static void worker_pass0_ready(HEVCContext *s)
+++{
+++  LOG_ENTER
+++    pthread_mutex_lock(&s->worker_mutex);
+++    // tail is number of submitted jobs
+++    // head is number of completed jobs
+++    // tail-head is number of outstanding jobs in the queue
+++    // we need to ensure there is at least 1 space left for us to use
+++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+++    {
+++      // Wait until another job is completed
+++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+++    }
+++    pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+++}
+ +
+-+extern unsigned get_version(int file_desc);
+-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+-+extern unsigned mem_free(int file_desc, unsigned handle);
+-+extern unsigned mem_lock(int file_desc, unsigned handle);
+-+extern unsigned mem_unlock(int file_desc, unsigned handle);
+-+extern void *mapmem_shared(unsigned base, unsigned size);
+-+extern void *mapmem_private(unsigned base, unsigned size);
+-+extern void unmapmem(void *addr, unsigned size);
+++static void *worker_start(void *arg)
+++{
+++  HEVCContext *s = (HEVCContext *)arg;
+++  while(1) {
+++    pthread_mutex_lock(&s->worker_mutex);
+ +
+-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern unsigned qpu_enable(int file_desc, unsigned enable);
+++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+++    {
+++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+++    }
+++    pthread_mutex_unlock(&s->worker_mutex);
+ +
+-+#endif
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-new file mode 100644
+-index 0000000..b1f50ee
+---- /dev/null
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -0,0 +1,652 @@
+-+#ifdef RPI
+-+// Use the vcsm device for shared memory
+-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+-+#define RPI_USE_VCSM
+-+#define RPI_TIME_TOTAL_QPU
+++    if (s->kill_worker) {
+++      break;
+++    }
+++    LOG_ENTER
+++    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+++    rpi_launch_vpu_qpu(s);
+++    // Perform inter prediction
+++    rpi_execute_inter_cmds(s);
+++    // Wait for transform completion
+++    vpu_wait(s->vpu_id);
+ +
+-+#include <stdio.h>
+-+#include <stdlib.h>
+-+#include <string.h>
+-+#include <stddef.h>
+-+#include <assert.h>
+++    // Perform intra prediction and residual reconstruction
+++    rpi_execute_pred_cmds(s);
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+ +
+-+#include "config.h"
+++    worker_complete_job(s);
+++    LOG_EXIT
+++  }
+++  return NULL;
+++}
+ +
+-+#include <pthread.h>
+-+#include <time.h>
+++#endif
+ +
+-+#include "rpi_mailbox.h"
+-+#include "rpi_qpu.h"
+-+#include "rpi_shader.h"
+-+#include "rpi_hevc_transform.h"
++ /**
++  * NOTE: Each function hls_foo correspond to the function foo in the
++  * specification (HLS stands for High Level Syntax).
++@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
++ /* free everything allocated  by pic_arrays_init() */
++ static void pic_arrays_free(HEVCContext *s)
++ {
+++#ifdef RPI
+++    int job;
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++      if (s->coeffs_buf_arm[job][0]) {
+++        gpu_free(&s->coeffs_buf_default[job]);
+++        s->coeffs_buf_arm[job][0] = 0;
+++      }
+++      if (s->coeffs_buf_arm[job][2]) {
+++        gpu_free(&s->coeffs_buf_accelerated[job]);
+++        s->coeffs_buf_arm[job][2] = 0;
+++      }
+++    }
+++#endif
+++#ifdef RPI_DEBLOCK_VPU
+++    {
+++        int i;
+++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ +
+-+#ifdef RPI_USE_VCSM
+-+#include "rpi_user_vcsm.h"
+++            if (dvq->vpu_cmds_arm) {
+++                gpu_free(&dvq->deblock_vpu_gmem);
+++              dvq->vpu_cmds_arm = 0;
+++            }
+++        }
+++    }
+ +#endif
++     av_freep(&s->sao);
++     av_freep(&s->deblock);
++ 
++@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
++     int ctb_count        = sps->ctb_width * sps->ctb_height;
++     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
++ 
+++#ifdef RPI
+++    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+++    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+++    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+++    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+++    int job;
+ +
+-+// On Pi2 there is no way to access the VPU L2 cache
+-+// GPU_MEM_FLG should be 4 for uncached memory.
+-+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+-+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+-+#define GPU_MEM_FLG 0xC
+-+#define GPU_MEM_MAP 0x0
+-+
+-+#define vcos_verify(x) ((x)>=0)
+-+
+-+typedef unsigned char uint8_t;
+-+typedef signed char int8_t;
+-+typedef unsigned short uint16_t;
+-+typedef unsigned int uint32_t;
+-+typedef int int32_t;
+-+
+-+/*static const unsigned code[] =
+-+{
+-+  #include "rpi_shader.hex"
+-+};*/
+-+
+-+// Size in 32bit words
+-+#define QPU_CODE_SIZE 2048
+-+#define VPU_CODE_SIZE 2048
+-+
+-+struct GPU
+-+{
+-+  unsigned int qpu_code[QPU_CODE_SIZE];
+-+  unsigned int vpu_code[VPU_CODE_SIZE];
+-+  int open_count; // Number of allocated video buffers
+-+  unsigned int vc_handle; // Handle of this memory
+-+  int      mb; // Mailbox handle
+-+  int      vc; // Address in GPU memory
+-+  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-+};
+-+
+-+// Stop more than one thread trying to allocate memory or use the processing resources at once
+-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+static volatile struct GPU* gpu = NULL;
+-+
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+static unsigned int Microseconds(void) {
+-+    struct timespec ts;
+-+    unsigned int x;
+-+    static unsigned int base = 0;
+-+    clock_gettime(CLOCK_REALTIME, &ts);
+-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+-+    if (base==0) base=x;
+-+    return x-base;
+-+}
+++    av_assert0(sps);
+++    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+++    s->ctu_per_y_chan = s->max_ctu_count / 12;
+++    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+++    // Allocate one default and one accelerated coefficient buffer per job.
+++    // A single pass suffices: the former outer loop shared 'job' with this one and therefore only ever ran once.
+++    printf("Allocated %d\n",coefs_per_row);
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++      gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+++      s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+++      if (!s->coeffs_buf_arm[job][0])
+++          goto fail;
+++      gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+++      s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+++      s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+++      if (!s->coeffs_buf_arm[job][2])
+++          goto fail;
+++      s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+++      s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+++    }
+ +#endif
+++#ifdef RPI_DEBLOCK_VPU
+++    {
+++        int i;
+++        s->enable_rpi_deblock = !sps->sao_enabled;
+++        s->setup_width = (sps->width+15) / 16;
+++        s->setup_height = (sps->height+15) / 16;
+++        s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+++        s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+++
+++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+++        {
+++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+++            const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; // three command entries, rounded up to 16 bytes
+++            const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; // luma setup area, rounded up to 16 bytes
+++            const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; // chroma setup area, rounded up to 16 bytes
+++            const unsigned int total_size = cmd_size + y_size + uv_size; // was "=-", which negated cmd_size and under-allocated the buffer subdivided below
+++            int p_vc;
+++            uint8_t * p_arm;
+++ #if RPI_VPU_DEBLOCK_CACHED
+++            gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+++ #else
+++            gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+++ #endif
+++            p_vc = dvq->deblock_vpu_gmem.vc;
+++            p_arm = dvq->deblock_vpu_gmem.arm;
+++
+++            // Zap all
+++            memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+++
+++            // Subdivide
+++            dvq->vpu_cmds_arm = (void*)p_arm;
+++            dvq->vpu_cmds_vc = p_vc;
+++
+++            p_arm += cmd_size;
+++            p_vc += cmd_size;
+++
+++            dvq->y_setup_arm = (void*)p_arm;
+++            dvq->y_setup_vc = (void*)p_vc;
+++
+++            p_arm += y_size;
+++            p_vc += y_size;
+++
+++            dvq->uv_setup_arm = (void*)p_arm;
+++            dvq->uv_setup_vc = (void*)p_vc;
+++
+++            dvq->cmd_id = -1;
+++        }
+ +
+-+// Connect to QPU, returns 0 on success.
+-+static int gpu_init(volatile struct GPU **gpu) {
+-+  int mb = mbox_open();
+-+  int vc;
+-+  int handle;
+-+  volatile struct GPU* ptr;
+-+	if (mb < 0)
+-+		return -1;
+-+
+-+	if (qpu_enable(mb, 1)) return -2;
+-+
+-+#ifdef RPI_USE_VCSM
+-+  vcsm_init();
+++        s->dvq_n = 0;
+++        s->dvq = s->dvq_ents + s->dvq_n;
+++    }
+ +#endif
+ +
+-+  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
+-+  if (!handle)
+-+  {
+-+    qpu_enable(mb, 0);
+-+    return -3;
+-+  }
+-+	vc = mem_lock(mb, handle);
+-+	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
+-+	if (ptr == NULL)
+-+	{	mem_free(mb, handle);
+-+		mem_unlock(mb, handle);
+-+		qpu_enable(mb, 0);
+-+		return -4;
+-+	}
+-+
+-+	ptr->mb = mb;
+-+	ptr->vc_handle = handle;
+-+	ptr->vc = vc;
+-+
+-+  *gpu = ptr;
+-+
+-+  // Now copy over the QPU code into GPU memory
+-+  {
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
+-+    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-+  }
+-+  // And the VPU code
+-+  {
+-+    int num_bytes = sizeof(rpi_hevc_transform);
+-+    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-+  }
+-+
+-+  return 0;
+-+}
+-+
+-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+-+static void gpu_lock(void) {
+-+  pthread_mutex_lock(&gpu_mutex);
+-+  if (gpu==NULL) {
+-+    gpu_init(&gpu);
+-+  }
+-+}
+-+
+-+static void gpu_unlock(void) {
+-+  pthread_mutex_unlock(&gpu_mutex);
+-+}
+-+
+-+// Allocate memory on GPU
+-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+-+// Returns 0 on success.
+-+// This allocates memory that will not be cached in ARM's data cache.
+-+// Therefore safe to use without data cache flushing.
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+-+  p->vcsm_handle = 0;
+-+  if (!p->vc_handle)
+-+  {
+-+    qpu_enable(gpu->mb, 0);
+-+    return -3;
+-+  }
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+-+  p->numbytes = numbytes;
+-+  if (p->arm == NULL)
+-+  {
+-+    mem_free(gpu->mb, p->vc_handle);
+-+    mem_unlock(gpu->mb, p->vc_handle);
+-+    gpu_unlock();
+-+    qpu_enable(gpu->mb, 0);
+-+    return -4;
+-+  }
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return 0;
+-+}
+-+
+-+void gpu_cache_flush(GPU_MEM_PTR_T *p)
++     s->bs_width  = (width  >> 2) + 1;
++     s->bs_height = (height >> 2) + 1;
++ 
++@@ -137,6 +422,29 @@ fail:
++     return AVERROR(ENOMEM);
++ }
++ 
+++static void default_pred_weight_table(HEVCContext * const s)
+ +{
+-+  // This only works when using RPI_USE_VCSM
+-+  void *tmp = vcsm_lock(p->vcsm_handle);
+-+  vcsm_unlock_ptr(tmp);
+-+}
+-+
+-+// This allocates data that will be
+-+//    Cached in ARM L2
+-+//    Uncached in VPU L2
+-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+#ifdef RPI_USE_VCSM
+-+  {
+-+      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
+-+      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+      p->arm = vcsm_lock(p->vcsm_handle);
+-+      p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  }
+-+#else
+-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+-+  p->vcsm_handle = 0;
+-+  if (!p->handle)
+-+  {
+-+    qpu_enable(gpu->mb, 0);
+-+    return -3;
+++  unsigned int i;
+++  s->sh.luma_log2_weight_denom = 0;
+++  s->sh.chroma_log2_weight_denom = 0;
+++  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+++      s->sh.luma_weight_l0[i] = 1;
+++      s->sh.luma_offset_l0[i] = 0;
+++      s->sh.chroma_weight_l0[i][0] = 1;
+++      s->sh.chroma_offset_l0[i][0] = 0;
+++      s->sh.chroma_weight_l0[i][1] = 1;
+++      s->sh.chroma_offset_l0[i][1] = 0;
+ +  }
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  printf("This mapmem_private does not seem to work\n");
+-+  exit(-1);
+-+  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+-+  p->numbytes = numbytes;
+-+  if (p->arm == NULL)
+-+  {
+-+    mem_free(gpu->mb, p->handle);
+-+    mem_unlock(gpu->mb, p->handle);
+-+    gpu_unlock();
+-+    qpu_enable(gpu->mb, 0);
+-+    return -4;
+++  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+++      s->sh.luma_weight_l1[i] = 1;
+++      s->sh.luma_offset_l1[i] = 0;
+++      s->sh.chroma_weight_l1[i][0] = 1;
+++      s->sh.chroma_offset_l1[i][0] = 0;
+++      s->sh.chroma_weight_l1[i][1] = 1;
+++      s->sh.chroma_offset_l1[i][1] = 0;
+ +  }
+-+#endif
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return 0;
+ +}
+ +
+-+static void gpu_term(void)
++ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
++ {
++     int i = 0;
++@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
++                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
++                 pred_weight_table(s, gb);
++             }
+++            else
+++            {
+++              // Give us unit weights
+++              default_pred_weight_table(s);
+++            }
++ 
++             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
++     return 0;
++ }
++ 
+++#ifdef RPI
+++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+ +{
+-+	int mb = gpu->mb;
+-+	unsigned handle = gpu->vc_handle;
+-+  if (gpu==NULL)
+-+    return;
+-+	unmapmem((void*)gpu, sizeof(struct GPU));
+-+	mem_unlock(mb, handle);
+-+	mem_free(mb, handle);
+-+	qpu_enable(mb, 0);
+-+#ifdef RPI_USE_VCSM
+-+  vcsm_exit();
+-+#endif
+-+	mbox_close(mb);
+-+  gpu = NULL;
+++    if (s->enable_rpi) {
+++        HEVCLocalContext *lc = s->HEVClc;
+++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++        cmd->type = RPI_PRED_INTRA;
+++        cmd->size = log2_trafo_size;
+++        cmd->c_idx = c_idx;
+++        cmd->x = x0;
+++        cmd->y = y0;
+++        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+++        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+++    } else {
+++        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+++    }
+ +}
+++#endif
+ +
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-+  int mb = gpu->mb;
+-+	unsigned handle = p->vc_handle;
+-+  gpu_lock();
+-+#ifdef RPI_USE_VCSM
+-+  if (p->vcsm_handle) {
+-+      mem_unlock(mb,p->vc_handle);
+-+      vcsm_unlock_ptr(p->arm);
+-+      vcsm_free(p->vcsm_handle);
+-+  } else {
+-+	unmapmem((void*)p->arm, sizeof(struct GPU));
+-+      mem_unlock(mb, handle);
+-+      mem_free(mb, handle);
+-+  }
++ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                               int xBase, int yBase, int cb_xBase, int cb_yBase,
++                               int log2_cb_size, int log2_trafo_size,
++@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++     if (lc->cu.pred_mode == MODE_INTRA) {
++         int trafo_size = 1 << log2_trafo_size;
++         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
++-
+++#ifdef RPI
+++        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+ +#else
+-+	unmapmem((void*)p->arm, sizeof(struct GPU));
+-+	mem_unlock(mb, handle);
+-+	mem_free(mb, handle);
++         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+ +#endif
+-+
+-+  gpu->open_count--;
+-+  if (gpu->open_count==0) {
+-+      printf("Closing GPU\n");
+-+      gpu_term();
+-+      gpu = NULL;
+-+  }
+-+  gpu_unlock();
+-+}
+-+
+-+unsigned int vpu_get_fn(void) {
+-+  // Make sure that the gpu is initialized
+-+  if (gpu==NULL) {
+-+    printf("Preparing gpu\n");
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,vpu_code);
+-+}
+-+
+-+unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+  unsigned r;
+-+  gpu_lock();
+-+  r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
+-+  gpu_unlock();
+-+  return r;
+-+}
+-+
+-+// Run a program on a QPU with the given code and uniform stream (given in GPU addresses)
+-+// The first num QPUs will start at code, the next num2 QPUs will start at code2
+-+void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12)
+-+{
+-+  int i;
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
++     }
++ 
++     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
++@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+ +#endif
+-+
+-+  gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
++                 }
++                 if (cbf_cb[i])
++                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
++@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+ +#endif
+-+  for(i=0;i<num;i++) {
+-+    gpu->mail[i*2 + 1] = code;
+-+  }
+-+  for(;i<num+num2;i++) {
+-+    gpu->mail[i*2 + 1] = code2;
+-+  }
+-+  gpu->mail[0 ] = unifs1;
+-+  gpu->mail[2 ] = unifs2;
+-+  gpu->mail[4 ] = unifs3;
+-+  gpu->mail[6 ] = unifs4;
+-+  gpu->mail[8 ] = unifs5;
+-+  gpu->mail[10] = unifs6;
+-+	gpu->mail[12] = unifs7;
+-+	gpu->mail[14] = unifs8;
+-+	gpu->mail[16] = unifs9;
+-+	gpu->mail[18] = unifs10;
+-+	gpu->mail[20] = unifs11;
+-+	gpu->mail[22] = unifs12;
+-+	execute_qpu(
+-+		gpu->mb,
+-+		12 /* Number of QPUs */,
+-+		gpu->vc + offsetof(struct GPU, mail),
+-+		1 /* no flush */,  // Don't flush VPU L1 cache
+-+		5000 /* timeout ms */);
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  if ((count&0x7f)==0)
+-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
++                 }
++                 if (cbf_cr[i])
++                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
++@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
++                                                     trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+ +#endif
+-+  gpu_unlock();
+-+}
+-+
+-+unsigned int qpu_get_fn(int num) {
+-+    // Make sure that the gpu is initialized
+-+    unsigned int *fn;
+-+    if (gpu==NULL) {
+-+      printf("Preparing gpu\n");
+-+      gpu_lock();
+-+      gpu_unlock();
+-+    }
+-+    switch(num) {
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-+    case QPU_MC_EXIT:
+-+      fn = mc_exit;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT:
+-+      fn = mc_interrupt_exit;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-+    case QPU_MC_FILTER_HONLY:
+-+      fn = mc_filter_honly;
+-+      break;
+-+    case QPU_MC_SETUP_UV:
+-+      fn = mc_setup_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV:
+-+      fn = mc_filter_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B:
+-+      fn = mc_filter_uv_b;
+-+      break;
+-+    case QPU_MC_END:
+-+      fn = mc_end;
+-+      break;
+-+    default:
+-+      printf("Unknown function\n");
+-+      exit(-1);
+-+    }
+-+    return gpu->vc + 4*(int)(fn-rpi_shader);
+-+    //return code[num] + gpu->vc;
++                 }
++                 if (cbf_cb[i])
++                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
++@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+++#endif
++                 }
++                 if (cbf_cr[i])
++                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
++@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
++             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
++             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+++#else
++             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
++             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+++#endif
++             if (s->ps.sps->chroma_format_idc == 2) {
++                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+++#else
++                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
++                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+++#endif
++             }
++         } else if (blk_idx == 3) {
++             int trafo_size_h = 1 << (log2_trafo_size + 1);
++             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
++             ff_hevc_set_neighbour_available(s, xBase, yBase,
++                                             trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+++#else
++             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
++             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+++#endif
++             if (s->ps.sps->chroma_format_idc == 2) {
++                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+++#else
++                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
++                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+++#endif
++             }
++         }
++     }
++@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++  * @param luma_offset additive offset applied to the luma prediction value
++  */
++ 
+++#ifdef RPI_INTER
+++#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+++                        int block_w, int block_h, int luma_weight, int luma_offset)
+++{
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_LUMA_UNI;
+++    cmd->dst = dst;
+++    cmd->dststride = dststride;
+++    cmd->src = ref->data[0];
+++    cmd->srcstride = ref->linesize[0];
+++    cmd->mv = *mv;
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->weight = luma_weight;
+++    cmd->offset = luma_offset;
+ +}
+ +
+-+#if 0
+-+
+-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
+-+
+-+static uint8_t av_clip_uint8(int32_t a)
+++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+ +{
+-+    if (a&(~255)) return (-a)>>31;
+-+    else          return a;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_LUMA_BI;
+++    cmd->dst = dst;
+++    cmd->dststride = dststride;
+++    cmd->src = ref0->data[0];
+++    cmd->srcstride = ref0->linesize[0];
+++    cmd->mv = *mv0;
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->src1 = ref1->data[0];
+++    cmd->srcstride1 = ref1->linesize[0];
+++    cmd->mv1 = *mv1;
+++    cmd->ref_idx[0] = current_mv->ref_idx[0];
+++    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+static int32_t filter8(const uint8_t *data, int pitch)
+++static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+++                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+ +{
+-+   int32_t vsum = 0;
+-+   int x, y;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += hcoeffs[x]*data[x + y * pitch];
+-+
+-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+-+   }
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_CHROMA_UNI;
+++    cmd->dst = dst0;
+++    cmd->dststride = dststride;
+++    cmd->src = src0;
+++    cmd->srcstride = srcstride;
+++    cmd->mv = current_mv->mv[reflist];
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->weight = chroma_weight;
+++    cmd->offset = chroma_offset;
+++}
+ +
+-+   return av_clip_uint8( (vsum + 64) >> 7);
+++static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+++                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+++{
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+++    cmd->dst = dst0;
+++    cmd->dststride = dststride;
+++    cmd->src = ref0->data[cidx+1];
+++    cmd->srcstride = ref0->linesize[cidx+1];
+++    cmd->mv = current_mv->mv[0];
+++    cmd->mv1 = current_mv->mv[1];
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->src1 = ref1->data[cidx+1];
+++    cmd->srcstride1 = ref1->linesize[cidx+1];
+++    cmd->ref_idx[0] = current_mv->ref_idx[0];
+++    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+// Note regression changes coefficients so is not thread safe
+-+//#define REGRESSION
+-+#ifdef REGRESSION
+-+#define CMAX 100
+ +#else
+-+#define CMAX 2
+++#define RPI_REDIRECT(fn) fn
+ +#endif
+-+#define YMAX 16
+-+
+-+int rpi_test_shader(void)
+-+{
+-+   int i, c;
+ +
+-+   uint32_t *unifs;
++ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
++                         int block_w, int block_h, int luma_weight, int luma_offset)
++@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
++     int idx              = ff_hevc_pel_weight[block_w];
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   uint8_t *in_buffer;
+-+   uint8_t *out_buffer[2];
++     x_off += mv->x >> 2;
++     y_off += mv->y >> 2;
++     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
++@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++  * @param mv1 motion vector1 (relative to block position) to get pixel data from
++  * @param current_mv current motion vector structure
++  */
++- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
++                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
++ {
++@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
++     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   GPU_MEM_PTR_T in_buffer_ptr;
+-+   GPU_MEM_PTR_T out_buffer_ptr[2];
++     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
++         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
++         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
++@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++     intptr_t _mx         = mx << (1 - hshift);
++     intptr_t _my         = my << (1 - vshift);
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+
+-+   int pitch = 0x500;
+-+
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
++     x_off += mv->x >> (2 + hshift);
++     y_off += mv->y >> (2 + vshift);
++     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
++@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
++     int hshift = s->ps.sps->hshift[1];
++     int vshift = s->ps.sps->vshift[1];
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   printf("This needs to change to reflect new assembler\n");
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
++     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
++     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
++     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
++@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
++     }
++ }
++ 
++-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++-                                int nPbW, int nPbH,
++-                                int log2_cb_size, int partIdx, int idx)
+++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+++                                const int nPbW, const int nPbH,
+++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++ {
++ #define POS(c_idx, x, y)                                                              \
++     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
++                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
++-    HEVCLocalContext *lc = s->HEVClc;
+++    HEVCLocalContext * const lc = s->HEVClc;
++     int merge_idx = 0;
++     struct MvField current_mv = {{{ 0 }}};
++ 
++@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++     int y_cb             = y0 >> log2_min_cb_size;
++     int x_pu, y_pu;
++     int i, j;
++-
++-    int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+++    const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++ 
++     if (!skip_flag)
++         lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
++@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi) {
+++            const Mv * const mv    = &current_mv.mv[0];
+++            const unsigned int mx          = mv->x & 3;
+++            const unsigned int my          = mv->y & 3;
+++            const unsigned int my_mx       = (my<<8) | mx;
+++            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
+++            const int x1_m3 = x0 + (mv->x >> 2) - 3;
+++            const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+++            uint32_t * y = s->curr_y_mvs;
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return -2;
+-+   }
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+-+      return -3;
+-+   }
+-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++              for(int start_x=0;start_x < nPbW;start_x+=16) {
+++                  const int bw = nPbW-start_x;
+++                  const int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+++                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+++        {
+++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
++                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
++                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
++                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+++        }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+++#ifdef RPI_INTER_QPU
+++          if (s->enable_rpi) {
+++                int hshift           = s->ps.sps->hshift[1];
+++                int vshift           = s->ps.sps->vshift[1];
+++                const Mv *mv         = &current_mv.mv[0];
+++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                intptr_t _mx         = mx << (1 - hshift);
+++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+-+      return -4;
+-+   }
+-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+++                int x1_c = x0_c + (mv->x >> (2 + hshift));
+++                int y1_c = y0_c + (mv->y >> (2 + vshift)); // vertical component uses the vertical chroma shift
+ +
+-+   for (c = 0; c < CMAX; c++) {
+-+      int xo[] = {rand()&31, rand()&31};
+++                uint32_t *u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      int bw = nPbW_c-start_x;
+++                      int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u;
+++                return;
+++            }
+++#endif
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
++                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
++-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
++                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
++         }
++@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi) {
+++            const int reflist = 1;
+++            const Mv *mv    = &current_mv.mv[reflist];
+++            int mx          = mv->x & 3;
+++            int my          = mv->y & 3;
+++            int my_mx = (my<<8) + mx;
+++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+++            int x1 = x0 + (mv->x >> 2);
+++            int y1 = y0 + (mv->y >> 2);
+++            uint32_t *y = s->curr_y_mvs;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              for(int start_x=0;start_x < nPbW;start_x+=16) {
+++                  int bw = nPbW-start_x;
+++                  int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+++                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+++                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+ +
+-+#ifdef REGRESSION
+-+      for (i = 0; i < 8; i++) {
+-+         hcoeffs[i] = (int8_t)rand();
+-+         vcoeffs[i] = (int8_t)rand();
+-+         if (hcoeffs[i]==-128)
+-+           hcoeffs[i]++;
+-+         if (vcoeffs[i]==-128)
+-+           vcoeffs[i]++;
+-+      }
+++        {
+++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
++                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
++                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
++                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+++        }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+++#ifdef RPI_INTER_QPU
+++            if (s->enable_rpi) { // Queue command words in s->curr_u_mvs and return; the chroma_mc_uni calls below are skipped
+++                const int reflist = 1;
+++                const int hshift           = s->ps.sps->hshift[1];
+++                const int vshift           = s->ps.sps->vshift[1];
+++                const Mv * const mv        = &current_mv.mv[reflist];
+++                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                const intptr_t _mx         = mx << (1 - hshift);
+++                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+++
+++                const int x1_c = x0_c + (mv->x >> (2 + hshift));
+++                const int y1_c = y0_c + (mv->y >> (2 + vshift)); // Integer part of mv->y: same vertical shift as used for my above
+++
+++                uint32_t * u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) { // One command per tile of at most RPI_CHROMA_BLOCK_WIDTH x 16
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      const int bw = nPbW_c-start_x;
+++                      const int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; // u++[-N] stores N words behind the cursor, then advances it
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u; // Publish the advanced cursor for the next prediction unit
+++                return;
+++            }
+ +#endif
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
++                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
++ 
++-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
++                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
++         }
++@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi && 0) {
+++            const Mv *mv    = &current_mv.mv[0];
+++            int mx          = mv->x & 3;
+++            int my          = mv->y & 3;
+++            int my_mx = (my<<8) + mx;
+++            const Mv *mv2    = &current_mv.mv[1];
+++            int mx2          = mv2->x & 3;
+++            int my2          = mv2->y & 3;
+++            int my2_mx2 = (my2<<8) + mx2;
+++            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+++            int x1 = x0 + (mv->x >> 2);
+++            int y1 = y0 + (mv->y >> 2);
+++            int x2 = x0 + (mv2->x >> 2);
+++            int y2 = y0 + (mv2->y >> 2);
+++            uint32_t *y = s->curr_y_mvs;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+++                  int bw = nPbW-start_x;
+++                  int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+ +
+-+      for (i = 0; i < 64*23; i++) {
+-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+-+         in_buffer[i] = rand();
+-+      }
+++                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+++                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+++                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+ +
+-+      // Clear output array
+-+      {
+-+        int b;
+-+        for(b=0;b<2;b++) {
+-+          for(i=0;i<16*16;i++) {
+-+            out_buffer[b][i] = 3;
+-+          }
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+++        {
+++            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
++                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
++                    ref1->frame, &current_mv.mv[1], &current_mv);
+ +        }
+-+      }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+++#ifdef RPI_INTER_QPU
+++          if (s->enable_rpi) { // Queue two commands per tile (ref0 pass, then ref1 pass) and return; chroma_mc_bi below is skipped
+++                int hshift           = s->ps.sps->hshift[1];
+++                int vshift           = s->ps.sps->vshift[1];
+++                const Mv *mv         = &current_mv.mv[0];
+++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                intptr_t _mx         = mx << (1 - hshift);
+++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+++                int x1_c = x0_c + (mv->x >> (2 + hshift));
+++                int y1_c = y0_c + (mv->y >> (2 + vshift)); // Integer part of mv->y: same vertical shift as used for my above
+ +
+-+      unifs[0] = mc_filter;
+-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+-+      unifs[2] = 64; // src pitch
+-+      unifs[3] = pitch; // dst pitch
+-+      unifs[4] = 0; // Padding
+-+      unifs[5] = 0;
+-+      unifs[6] = 0;
+-+      unifs[7 ] = mc_filter;
+-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[13] = out_buffer_ptr[0].vc;
+-+      unifs[14] = mc_exit;
+-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[20] = out_buffer_ptr[1].vc;
+-+
+-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+++                const Mv *mv2         = &current_mv.mv[1];
+++                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+++                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+++                intptr_t _mx2         = mx2 << (1 - hshift);
+++                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+ +
+-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+++                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+++                int y2_c = y0_c + (mv2->y >> (2 + vshift)); // Integer part of mv2->y: same vertical shift as used for my2 above
+ +
+-+      if (1)
+-+      {
+-+         int x, y, b;
+-+         int bad = 0;
+ +
+-+         for (b=0; b<2; ++b)
+-+            for (y=0; y<YMAX; ++y)
+-+               for (x=0; x<16; ++x) {
+-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+++                uint32_t *u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      int bw = nPbW_c-start_x;
+++                      int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0; // u++[-N] stores N words behind the cursor, then advances it
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+++                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
+++                      *u++ = 0;
+ +
+-+                  if (out_buffer[b][x+y*pitch] != ref) {
+-+                      bad = 1;
+-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
+-+                  }
+-+#ifndef REGRESSION
+-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b; // Second command of the pair: ref1 pass carrying the destination addresses
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx2][0];
+++                      *u++ = rpi_filter_coefs[_my2][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u; // Publish the advanced cursor for the next prediction unit
+++                return;
+++            }
+ +#endif
+-+               }
+-+          if (bad)
+-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+          else
+-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+      }
+-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
+-+   }
+-+
+-+   gpu_free(&out_buffer_ptr[0]);
+-+   gpu_free(&out_buffer_ptr[1]);
+-+   gpu_free(&in_buffer_ptr);
+-+   gpu_free(&unifs_ptr);
+-+
+-+   return 0;
+-+}
+-+
+-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
++                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
++ 
++-            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+++            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
++                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
++         }
++     }
++@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
++     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
++ }
++ 
+++#ifdef RPI
+++static void rpi_execute_dblk_cmds(HEVCContext *s) // Replay the queued filter commands of job s->pass1_job, then empty that queue
+ +{
+-+  int x,y;
+-+  for (y=0; y<16; ++y) {
+-+    for (x=0; x<16; ++x) {
+-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+++    int n;
+++    int job = s->pass1_job;
+++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+++    int (*p)[2] = s->dblk_cmds[job]; // Each entry is an int pair, forwarded as the 2nd and 3rd arguments below
+++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+ +    }
+-+  }
+++    s->num_dblk_cmds[job] = 0; // Queue consumed; reset the count for this job
+ +}
+ +
+-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+++static void rpi_execute_transform(HEVCContext *s) // Flush the coefficient buffer, post the VPU job (id kept in s->vpu_id), zero the coefficient counts
+ +{
+-+   uint32_t *unifs;
+-+
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   //uint8_t *out_buffer;
+-+   //GPU_MEM_PTR_T out_buffer_ptr;
+-+
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+   //int x,y;
+-+
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+-+
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+-+
+-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return;
+-+   }
+-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+-+
+-+   /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         out_buffer[x+y*dst_pitch] = 7;
+-+      }
+++    int i=2; // Initial value only matters to the commented-out code below; i is reused as the reset-loop index
+++    int job = s->pass1_job;
+++    /*int j;
+++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
+++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+++        s->hevcdsp.idct[4-2](coeffs, 16);
+++    }
+++    i=3;
+++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+++        s->hevcdsp.idct[5-2](coeffs, 32);
+ +    }*/
+ +
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+-+
+-+    unifs[0] = mc_filter;
+-+    unifs[1] = (int)in_buffer_vc;
+-+    unifs[2] = src_pitch; // src pitch
+-+    unifs[3] = dst_pitch; // dst pitch
+-+    unifs[4] = 0; // Padding
+-+    unifs[5] = 0;
+-+    unifs[6] = 0;
+-+    unifs[7 ] = mc_exit;
+-+    unifs[8 ] = (int)in_buffer_vc;
+-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+    unifs[13] = (int)dst_vc;
+-+    //unifs[13] = (int)out_buffer_ptr.vc;
+-+
+-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+
+-+    /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+-+      }
+-+    }*/
+++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]); // Flush is issued before the VPU job is posted
+++    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], // Returned id is what rpi_do_all_passes hands to vpu_wait
+++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], // >> 8 divides by 16*16
+++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); // >> 10 divides by 32*32
+++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
+++    //vpu_wait(s->vpu_id);
+ +
+-+    gpu_free(&unifs_ptr);
+-+    //gpu_free(&out_buffer_ptr);
+++    for(i=0;i<4;i++) // Reset all four coefficient counts of this job
+++        s->num_coeffs[job][i] = 0;
+ +}
+ +
+-+
+++static void rpi_execute_pred_cmds(HEVCContext *s) // Replay the queued prediction commands of job s->pass1_job, then empty that queue
+++{
+++  int i;
+++  int job = s->pass1_job;
+++  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+++#ifdef RPI_WORKER
+++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+++#else
+++  HEVCLocalContext *lc = s->HEVClc;
+ +#endif
+ +
+-+#endif // RPI
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-new file mode 100644
+-index 0000000..4e3c35c
+---- /dev/null
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -0,0 +1,45 @@
+-+#ifndef RPI_QPU_H
+-+#define RPI_QPU_H
+-+
+-+typedef struct gpu_mem_ptr_s {
+-+  unsigned char *arm; // Pointer to memory mapped on ARM side
+-+  int vc_handle;   // Videocore handle of relocatable memory
+-+  int vcsm_handle; // Handle for use by VCSM
+-+  int vc;       // Address for use in GPU code
+-+  int numbytes; // Size of memory block
+-+} GPU_MEM_PTR_T;
+-+
+-+// General GPU functions
+-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern void gpu_free(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+
+-+// QPU specific functions
+-+extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+++      if (cmd->type == RPI_PRED_INTRA) {
+++          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode; // Same mode value stored in both fields
+++          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1; // cmd->na packs the five availability flags in bits 4..0
+++          lc->na.cand_left         = (cmd->na >> 3) & 1;
+++          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+++          lc->na.cand_up           = (cmd->na >> 1) & 1;
+++          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+++          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx); // Function table is indexed by cmd->size - 2
+++      } else { // Every non-intra command is handled by transform_add on cmd->dst with cmd->buf
+++#ifdef RPI_PRECLEAR
+++          int trafo_size = 1 << cmd->size;
+++#endif
+++          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+++#ifdef RPI_PRECLEAR
+++          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+++#endif
+++      }
+++  }
+++  s->num_pred_cmds[job] = 0; // Queue consumed; reset the count for this job
+++}
+ +
+-+enum {
+-+  QPU_MC_SETUP,
+-+  QPU_MC_FILTER,
+-+  QPU_MC_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT,
+-+  QPU_MC_FILTER_B,
+-+  QPU_MC_FILTER_HONLY,
+-+  QPU_MC_SETUP_UV,
+-+  QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B,
+-+  QPU_MC_END
+-+  };
+-+extern unsigned int qpu_get_fn(int num);
+-+
+-+// VPU specific functions
+-+extern unsigned int vpu_get_fn(void);
+-+extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+
+-+// Simple test of shader code
+-+extern int rpi_test_shader(void);
+++static void rpi_execute_inter_cmds(HEVCContext *s) // Replay the queued motion-compensation commands of job s->pass1_job via the luma/chroma mc functions
+++{
+++    int job = s->pass1_job;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+++    int n,cidx;
+++    AVFrame myref; // Scratch frames: only the data[]/linesize[] entries assigned below are ever set
+++    AVFrame myref1;
+++    struct MvField mymv;
+++    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) { // More commands queued than RPI_MAX_MV_CMDS: report and terminate the process
+++        printf("Overflow inter_cmds\n");
+++        exit(-1);
+++    }
+++    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+++        switch(cmd->cmd) {
+++        case RPI_CMD_LUMA_UNI:
+++            myref.data[0] = cmd->src;
+++            myref.linesize[0] = cmd->srcstride;
+++            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+++            break;
+++        case RPI_CMD_LUMA_BI:
+++            myref.data[0] = cmd->src;
+++            myref.linesize[0] = cmd->srcstride;
+++            myref1.data[0] = cmd->src1;
+++            myref1.linesize[0] = cmd->srcstride1;
+++            mymv.ref_idx[0] = cmd->ref_idx[0];
+++            mymv.ref_idx[1] = cmd->ref_idx[1];
+++            luma_mc_bi(s, cmd->dst, cmd->dststride,
+++                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+++                       &myref1, &cmd->mv1, &mymv);
+++            break;
+++        case RPI_CMD_CHROMA_UNI:
+++            mymv.mv[0] = cmd->mv; // The vector is stored in slot 0 and the call below passes 0 as its sixth argument
+++            chroma_mc_uni(s, cmd->dst,
+++                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+++                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+++            break;
+++        case RPI_CMD_CHROMA_BI:
+++        case RPI_CMD_CHROMA_BI+1: // The two command values map to cidx 0 and 1
+++            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+++            myref.data[cidx+1] = cmd->src; // Only plane cidx+1 of the scratch frames is filled in
+++            myref.linesize[cidx+1] = cmd->srcstride;
+++            myref1.data[cidx+1] = cmd->src1;
+++            myref1.linesize[cidx+1] = cmd->srcstride1;
+++            mymv.ref_idx[0] = cmd->ref_idx[0];
+++            mymv.ref_idx[1] = cmd->ref_idx[1];
+++            mymv.mv[0] = cmd->mv;
+++            mymv.mv[1] = cmd->mv1;
+++            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+++                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+++            break;
+++        }
+++    }
+++    s->num_mv_cmds[job] = 0; // Queue consumed; reset the count for this job
+++}
+ +
+-+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+-+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+++static void rpi_do_all_passes(HEVCContext *s) // Runs the deferred stages for one batch, in the fixed order below
+++{
+++    // Kick off QPUs and VPUs
+++    rpi_launch_vpu_qpu(s);
+++    // Replay queued inter prediction commands (rpi_execute_inter_cmds handles luma and chroma command types)
+++    rpi_execute_inter_cmds(s);
+++    // Wait for transform completion
+++    vpu_wait(s->vpu_id);
+++    // Perform intra prediction and residual reconstruction
+++    rpi_execute_pred_cmds(s);
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+++    // Prepare next batch
+++    rpi_begin(s);
+++}
+ +
+ +#endif
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-new file mode 100644
+-index 0000000..41cc2e1
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.c
+-@@ -0,0 +1,818 @@
+-+#include "rpi_shader.h"
+ +
+-+#ifdef _MSC_VER
+-+   #include <stdint.h>
+-+   /* cast through uintptr_t to avoid warnings */
+-+   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+-+#else
+-+   #define POINTER_TO_UINT(X) ((unsigned int)(X))
+-+#endif
+++#ifdef RPI
+++static void rpi_begin(HEVCContext *s)
+++{
+++    int job = s->pass0_job;
+++    int i;
+++#ifdef RPI_INTER_QPU
+++    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+++    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+ +
+-+#ifdef __cplusplus
+-+extern "C" { /* the types are probably wrong... */
+-+#endif
+-+#ifdef __cplusplus
+-+}
+++    for(i=0;i<8;i++) {
+++        s->u_mvs[job][i] = s->mvs_base[job][i];
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = pic_width;
+++        *s->u_mvs[job][i]++ = pic_height;
+++        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+++        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+++        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+++    }
+++    s->curr_u_mvs = s->u_mvs[job][0];
+ +#endif
+ +
+-+#ifdef _MSC_VER
+-+__declspec(align(8))
+-+#elif defined(__GNUC__)
+-+__attribute__((aligned(8)))
+-+#endif
+-+unsigned int rpi_shader[] = {
+-+// ::mc_setup
+-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
+-+/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
+-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+// ::mc_filter_uv
+-+/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop
+-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter
+-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+-+/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :loop
+-+/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// :fast_path
+-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :fast_loop
+-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+-+/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+-+/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+-+/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_b
+-+/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :bloop
+-+/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+-+/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_honly
+-+/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+-+/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+-+/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :loop_honly
+-+/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+-+/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+-+/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+-+/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+-+/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+-+/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit
+-+/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit4
+-+/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_setup_uv
+-+/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+// ::mc_filter_uv_b
+-+/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop_b
+-+/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_end
+-+};
+-+#ifdef __HIGHC__
+-+#pragma Align_to(8, rpi_shader)
+++#ifdef RPI_LUMA_QPU
+++    for(i=0;i<12;i++) {
+++        // This needs to have a generally similar structure to the
+++        // actual filter code as various pipelined bits need to land correctly
+++        // when inserted by the filter requests
+++        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+++        *s->y_mvs[job][i]++ = 0; // y_x
+++        *s->y_mvs[job][i]++ = 0; // ref_y_base
+++        *s->y_mvs[job][i]++ = 0; // y2_x2
+++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+++        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight denom + 6
+++        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+++        *s->y_mvs[job][i]++ = 0; // Next kernel
+++    }
+++    s->curr_y_mvs = s->y_mvs[job][0];
+ +#endif
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-new file mode 100644
+-index 0000000..db971f4
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.h
+-@@ -0,0 +1,20 @@
+-+#ifndef rpi_shader_H
+-+#define rpi_shader_H
+-+
+-+extern unsigned int rpi_shader[];
+-+
+-+#define mc_setup (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 146)
+-+#define mc_filter (rpi_shader + 360)
+-+#define mc_filter_b (rpi_shader + 670)
+-+#define mc_filter_honly (rpi_shader + 894)
+-+#define mc_exit (rpi_shader + 1048)
+-+#define mc_exit1 (rpi_shader + 1066)
+-+#define mc_interrupt_exit (rpi_shader + 1082)
+-+#define mc_interrupt_exit4 (rpi_shader + 1120)
+-+#define mc_interrupt_exit8 (rpi_shader + 1142)
+-+#define mc_setup_uv (rpi_shader + 1172)
+-+#define mc_filter_uv_b (rpi_shader + 1314)
+-+#define mc_end (rpi_shader + 1542)
+-+
+++    s->ctu_count = 0;
+++}
+ +#endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-new file mode 100644
+-index 0000000..6851e83
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -0,0 +1,1413 @@
+-+# register allocation
+-+#
+-+# ra0...ra7                                     eight horizontal filter coefficients
+-+#
+-+# rb1...rb7                                     seven shifted copies of the current unfiltered row
+-+#
+-+# ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
+-+#
+-+#                                               (ra15 isn't clamped to zero - this happens during the
+-+#                                                copy to ra14, and during its use in the vertical filter)
+-+#
+-+# rb8...rb15                                    eight vertical filter coefficients
+-+#
+-+# ra16                                          clipped(row start address+elem_num)&~3
+-+# ra17                                          per-channel shifts
+-+# ra19                                          next ra17
+-+#
+-+# rb16                                          pitch
+-+# rb17                                          height + 5
+-+# rb18                                          height + 7
+-+# rb19                                          next ra16
+-+#
+-+# ra20                                          1
+-+# ra21                                          64
+-+# ra22                                          256
+-+# ra23                                          8
+-+#
+-+# rb20                                          0xffffff00
+-+# rb21                                          64
+-+# rb22                                          255
+-+# rb23                                          24
+-+#
+-+# rb24                                          vdw_setup_1(dst_pitch)
+-+# rb25                                          frame width-1
+-+# rb26                                          height<<23 + width<<16 + vdw_setup_0
+-+# rb27                                          vdw_setup_0 (depends on QPU number)
+-+# rb28                                          vpm_setup (depends on QPU number)
+-+# rb29                                          vdw_setup_1(dst_pitch-width)
+-+# rb30                                          frame height-1
+-+# rb31                                          used as temp to count loop iterations
+-+#
+-+# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
+-+# ra24                                          clipped(row start address+8+elem_num)&~3
+-+# ra25                                          per-channel shifts 2
+-+# ra26                                          next ra24
+-+# ra27                                          next ra25
+-+# ra28                                          next y
+-+# ra29                                          y for next texture access
+-+#
+-+# ra31                                          next kernel address
+ +
+-+.set rb_frame_width_minus_1,       rb25
+-+.set rb_frame_height_minus_1,      rb30
+-+.set rb_pitch,                     rb16
+-+.set ra_x_base,                    ra16
+-+.set rb_x_base_next,               rb19
+-+.set ra_x2_base,                   ra24
+-+.set ra_x2_base_next,              ra26
+-+.set ra_xshift,                    ra17
+++#ifdef RPI_SIMULATE_QPUS
+ +
+-+.set ra_x2shift,                   ra25
+-+.set ra_u2v_ref_offset,            ra25
+++static int32_t clipx(int x,int FRAME_WIDTH) // clamp x into [0, FRAME_WIDTH-1]
+++{
+++	if (x<=0) return 0; // zero or negative maps to 0
+++	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; // FRAME_WIDTH or above maps to the last valid index
+++	return x; // already within range, returned unchanged
+++}
+ +
+-+.set ra_xshift_next,               ra19
+++static int32_t clipy(int y,int FRAME_HEIGHT) // clamp y into [0, FRAME_HEIGHT-1]
+++{
+++	if (y<=0) return 0; // zero or negative maps to 0
+++	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; // FRAME_HEIGHT or above maps to the last valid index
+++	return y; // already within range, returned unchanged
+++}
+ +
+-+.set ra_x2shift_next,              ra27
+-+.set ra_u2v_dst_offset,            ra27
+++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+++{
+++   int32_t vsum = 0;
+++   int x, y;
+ +
+-+.set ra_y_next,                    ra28
+-+.set ra_y,                         ra29
+++   for (y = 0; y < 8; y++) {
+++      int32_t hsum = 0;
+ +
+-+.set rb_const_64,                  rb21
+++      for (x = 0; x < 8; x++)
+++         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+ +
+-+# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
+-+::mc_setup
+++      vsum += lumaFilter[my][y]*hsum;
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+round)>>denom)+offset;
+ +
+-+# Read starting kernel
+-+mov ra31, unif
+++   return av_clip_uint8( vsum );
+++}*/
+ +
+-+# Load first request location
+-+add ra_x_base, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_x2_base, unif # Store frame base
+++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+++{
+++  int32_t vsum = 0;
+++  int x, y;
+++  int chromaFilterH[4];
+++  int chromaFilterV[4];
+++  int i;
+++  int offset_after = offset_weight>>16;
+++  int weight = (offset_weight<<16)>>16;
+++  for(i=0;i<4;i++) {
+++    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+++    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+++  }
+ +
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++   for (y = 0; y < 4; y++) {
+++      int32_t hsum = 0;
+ +
+-+# get source pitch
+-+mov rb16, unif
+++      for (x = 0; x < 4; x++)
+++         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++      vsum += chromaFilterV[y]*hsum;
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+ +
+-+# load constants
+++   return vsum;
+++}
+ +
+-+mov ra20, 1
+-+mov ra21, 64
+-+mov ra22, 256
+-+mov ra23, 8
+++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+ +
+-+mov rb20, 0xffffff00
+-+mov rb21, 64
+-+mov rb22, 255
+-+mov rb23, 24
+++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+++{
+++  int32_t vsum = 0;
+++  int x, y;
+++  int i;
+++  int offset_after = offset_weight>>16;
+++  int weight = (offset_weight<<16)>>16;
+ +
+-+# touch vertical context to keep simulator happy
+++   for (y = 0; y < 8; y++) {
+++      int32_t hsum = 0;
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++      for (x = 0; x < 8; x++)
+++         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+ +
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+++      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+ +
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+add rb28, r0, r1
+++   return vsum;
+++}
+ +
+-+# Compute base address for first and second access
+-+#add r0, unif, elem_num     # x
+-+mov r0, ra_x_base           # Load x
+-+add r2, r0, 8               # x+8
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+add ra_y, r1, 1
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+max r1, r1, 0  # y
+-+min r1, r1, rb_frame_height_minus_1
+-+add r0, r0, r3; mul24 r1, r1, rb_pitch
+-+add r2, r2, r3
+-+and r0, r0, ~3
+-+and r2, r2, ~3; mov ra_x_base, r0
+-+# submit texture requests for first line
+-+add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r2, r1
+++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+++{
+++  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+++  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+++  int pitch = frame->linesize[cIdx];
+++  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+++    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+++  if (p>=base && p<base+pitch*pic_height) {
+++    return frame->data[cIdx] + (p-base);
+++  }
+++  return NULL;
+++}
+ +
+-+# Dump padding words
+-+mov r0, unif
+-+mov r0, unif
+++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+++{
+++  SliceHeader *sh   = &s->sh;
+++  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+++  int i;
+++  if (arm) return arm;
+++  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+++  {
+++    for(i=0;i<sh->nb_refs[L0];i++) {
+++      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+++      if (arm) return arm;
+++    }
+++  }
+++  if (sh->slice_type == B_SLICE) {
+++    for(i=0;i<sh->nb_refs[L1];i++) {
+++      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+++      if (arm) return arm;
+++    }
+++  }
+++  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+++  exit(-1);
+++  return NULL;
+++}
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x_base
+-+add t0s, r1, ra_x2_base
+++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+++{
+++  uint32_t next_kernel;
+++  uint32_t x0;
+++  uint32_t y0;
+++  uint8_t *ref_u_base;
+++  uint8_t *ref_v_base;
+++  uint32_t frame_width = p[5];
+++  uint32_t frame_height = p[6];
+++  uint32_t pitch = p[7];
+++  uint32_t dst_pitch = p[8];
+++  int32_t offset_before = p[9];
+++  int32_t denom = p[10];
+++  uint32_t vpm_id = p[11];
+++  uint32_t tmp_u_dst[256];
+++  uint32_t tmp_v_dst[256];
+++  while(1) {
+++    p += 12;
+++    next_kernel = p[0-12];
+++    x0 = p[1-12];
+++    y0 = p[2-12];
+++    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+++      int x,y;
+++      uint32_t width_height = p[5];
+++      uint32_t hcoeffs = p[6];
+++      uint32_t vcoeffs = p[7];
+++      uint32_t offset_weight_u = p[8];
+++      uint32_t offset_weight_v = p[9];
+++      uint8_t *this_u_dst;
+++      uint8_t *this_v_dst;
+++      uint32_t width = width_height >> 16;
+++      uint32_t height = (width_height << 16) >> 16;
+++      ref_u_base = compute_arm_addr(s,p[3-12],1);
+++      ref_v_base = compute_arm_addr(s,p[4-12],2);
+++      if (next_kernel!=s->mc_filter_uv_b0)
+++      {
+++        this_u_dst = compute_arm_addr(s,p[10],1);
+++        this_v_dst = compute_arm_addr(s,p[11],2);
+++      }
+++      for (y=0; y<height; ++y) {
+++        for (x=0; x<width; ++x) {
+++          if (next_kernel==s->mc_filter_uv) {
+++            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          } else if (next_kernel==s->mc_filter_uv_b0) {
+++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+++            tmp_u_dst[x+y*16] = refa;
+++            tmp_v_dst[x+y*16] = refb;
+++          } else {
+++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          }
+++        }
+++      }
+++    } else {
+++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+++      break;
+++    }
+++  }
+++}
+ +
+-+################################################################################
+++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+++{
+++  uint32_t next_kernel;
+++  int y_x,y2_x2;
+++  int x0;
+++  int y0;
+++  int x2;
+++  int y2;
+++  uint32_t *p0 = p;
+++  uint8_t *ref_y_base;
+++  uint8_t *ref_y2_base;
+++  uint32_t frame_width_height = p[4];
+++  uint32_t frame_width = frame_width_height>>16;
+++  uint32_t frame_height = (frame_width_height<<16)>>16;
+++  uint32_t pitch = p[5];
+++  uint32_t dst_pitch = p[6];
+++  int offset_shift = p[7];
+++  int32_t offset_before = offset_shift>>16;
+++  int32_t denom = (offset_shift<<16)>>16;
+++  while(1) {
+++    p += 9;
+++    next_kernel = p[8-9];
+++    y_x = p[0-9];
+++    x0 = (y_x<<16)>>16;
+++    y0 = y_x>>16;
+++    y2_x2 = p[2-9];
+++    x2 = (y2_x2<<16)>>16;
+++    y2 = y2_x2>>16;
+ +
+-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+++    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+++      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++      int x,y;
+++      uint32_t width_height = p[4];
+++      uint32_t my2_mx2_my_mx = p[5];
+++      uint32_t offset_weight = p[6];
+++      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+++      uint32_t width = width_height >> 16;
+++      uint32_t height = (width_height << 16) >> 16;
+++      uint8_t *dst_base = s->frame->data[0];
+++      ref_y_base = compute_arm_addr(s,p[1-9],0);
+++      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+++      for (y=0; y<height; ++y) {
+++        for (x=0; x<width; ++x) {
+++          if (next_kernel==s->mc_filter) {
+++            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+++            refa = av_clip_uint8(refa);
+++            this_dst[x+y*dst_pitch] = refa;
+++          }
+++          else {
+++            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+++            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+++            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          }
+++        }
+++      }
+++    } else {
+++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+++      break;
+++    }
+++  }
+++}
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_uv
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+-+# get filter coefficients
+++static void rpi_simulate_inter_qpu(HEVCContext *s)
+++{
+++  // First run the transform as normal
+++  int i;
+++  rpi_execute_transform(s);
+++  for(i=0;i<8;i++)
+++  {
+++    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+++  }
+++  for(i=0;i<12;i++)
+++  {
+++    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+++  }
+++}
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++#endif
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++#ifdef RPI_INTER_QPU
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++static void rpi_launch_vpu_qpu(HEVCContext *s)
+++{
+++    int k;
+++    int job = s->pass1_job;
+++    int i;
+++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+++#ifdef RPI_LUMA_QPU
+++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+++#endif
+++    if (s->sh.slice_type == I_SLICE) {
+++#ifdef RPI_MULTI_MAILBOX
+++      rpi_execute_transform(s);
+++      return;
+++#endif
+++    }
+++    for(k=0;k<8;k++) {
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+++        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+++    }
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+ +
+-+mov r3, 0
+++#ifdef RPI_LUMA_QPU
+++    for(k=0;k<12;k++) {
+++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+++        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+++        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+++    }
+++    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+++#endif
+ +
+-+:uvloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++#ifdef RPI_SIMULATE_QPUS
+++    rpi_simulate_inter_qpu(s);
+++    return;
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++#ifdef RPI_MULTI_MAILBOX
+++#ifdef RPI_CACHE_UNIF_MVS
+++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+++#else
+++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+++#endif
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++#if 1
+++    {
+++        unsigned int i;
+++        uint32_t * p;
+++        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+++        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+++
+++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+++            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
+++            *p++ = code;
+++        }
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++        code = qpu_get_fn(QPU_MC_SETUP);
+++        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+++            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
+++            *p++ = code;
+++        }
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+++            vpu_get_constants(),
+++            s->coeffs_buf_vc[job][2],
+++            s->num_coeffs[job][2] >> 8,
+++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+++            s->num_coeffs[job][3] >> 10,
+++            0,
+++            // QPU job 1
+++            QPU_N_UV,
+++            mail_uv,
+++            // QPU job 2
+++            QPU_N_Y,
+++            mail_y
+++            );
+++    }
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++#else
+++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+++                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+++                                   qpu_get_fn(QPU_MC_SETUP_UV),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++#ifdef RPI_LUMA_QPU
+++                                   qpu_get_fn(QPU_MC_SETUP),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+++#else
+++                                   0,
+++                                   0,0,0,0,
+++                                   0,0,0,0,
+++                                   0,0,0,0
+++#endif
+++                                 );
+++#endif
+++    for(i=0;i<4;i++)
+++        s->num_coeffs[job][i] = 0;
+++#else
+++#error Code rotted here
+++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+++      );
+++#endif
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+ +
+-+# apply vertical filter and write to VPM
+++}
+++#else
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++#ifdef RPI
+++static void rpi_launch_vpu_qpu(HEVCContext *s)
+++{
+++  rpi_execute_transform(s);
+++}
+++#endif
+ +
+-+# DMA out for U
+++#endif
+ +
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++#ifdef RPI
+ +
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++#ifndef RPI_FAST_CACHEFLUSH
+++#error RPI_FAST_CACHEFLUSH is broken
+++static void flush_buffer(AVBufferRef *bref) {
+++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+++    gpu_cache_flush(p);
+++}
+++#endif
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++static void flush_frame(HEVCContext *s,AVFrame *frame)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+++    int n = s->ps.sps->height;
+++    int curr_y = 0;
+++    int curr_uv = 0;
+++    int n_uv = n >> s->ps.sps->vshift[1];
+++    int sz,base;
+++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++    base = s->frame->linesize[1] * curr_uv;
+++    iocache.s[0].handle = p.vcsm_handle;
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int)(p.arm) + base;
+++    iocache.s[0].size  = sz;
+++    p = get_gpu_mem_ptr_v(s->frame);
+++    iocache.s[1].handle = p.vcsm_handle;
+++    iocache.s[1].cmd = 3; // clean+invalidate
+++    iocache.s[1].addr = (int)(p.arm) + base;
+++    iocache.s[1].size  = sz;
+++    p = get_gpu_mem_ptr_y(s->frame);
+++    sz = s->frame->linesize[0] * (n-curr_y);
+++    base = s->frame->linesize[0] * curr_y;
+++    iocache.s[2].handle = p.vcsm_handle;
+++    iocache.s[2].cmd = 3; // clean+invalidate
+++    iocache.s[2].addr = (int)(p.arm) + base;
+++    iocache.s[2].size  = sz;
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    flush_buffer(frame->buf[0]);
+++    flush_buffer(frame->buf[1]);
+++    flush_buffer(frame->buf[2]);
+++#endif
+++}
+ +
+-+################################################################################
+++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++    int n;
+++    int curr_y;
+++    int curr_uv;
+++    int n_uv;
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+++    int sz,base;
+++    int (*d)[2] = s->dblk_cmds[job];
+++    int low=(*d)[1];
+++    int high=(*d)[1];
+++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+++        int y = (*d)[1];
+++        low=FFMIN(low,y);
+++        high=FFMAX(high,y);
+++    }
+++    curr_y = low;
+++    n = high+(1 << s->ps.sps->log2_ctb_size);
+++    curr_uv = curr_y >> s->ps.sps->vshift[1];
+++    n_uv = n >> s->ps.sps->vshift[1];
+ +
+++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++    base = s->frame->linesize[1] * curr_uv;
+++    iocache.s[0].handle = p.vcsm_handle;
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int)(p.arm) + base;
+++    iocache.s[0].size  = sz;
+++    p = get_gpu_mem_ptr_v(s->frame);
+++    iocache.s[1].handle = p.vcsm_handle;
+++    iocache.s[1].cmd = 3; // clean+invalidate
+++    iocache.s[1].addr = (int)(p.arm) + base;
+++    iocache.s[1].size  = sz;
+++    p = get_gpu_mem_ptr_y(s->frame);
+++    sz = s->frame->linesize[0] * (n-curr_y);
+++    base = s->frame->linesize[0] * curr_y;
+++    iocache.s[2].handle = p.vcsm_handle;
+++    iocache.s[2].cmd = 3; // clean+invalidate
+++    iocache.s[2].addr = (int)(p.arm) + base;
+++    iocache.s[2].size  = sz;
+ +
+-+# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+-+
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter
+-+mov ra31, unif
+++    iocache.s[3].handle = p0->vcsm_handle;
+++    iocache.s[3].cmd = 3; // clean+invalidate
+++    iocache.s[3].addr = (int) p0->arm;
+++    iocache.s[3].size  = p0->numbytes;
+++    if (p1) {
+++      iocache.s[4].handle = p1->vcsm_handle;
+++      iocache.s[4].cmd = 3; // clean+invalidate
+++      iocache.s[4].addr = (int) p1->arm;
+++      iocache.s[4].size  = p1->numbytes;
+++    }
+++    if (p2) {
+++      iocache.s[5].handle = p2->vcsm_handle;
+++      iocache.s[5].cmd = 3; // clean+invalidate
+++      iocache.s[5].addr = (int) p2->arm;
+++      iocache.s[5].size  = p2->numbytes;
+++    }
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    flush_buffer(frame->buf[0]);
+++    flush_buffer(frame->buf[1]);
+++    flush_buffer(frame->buf[2]);
+++    gpu_cache_flush3(p0, p1, p2);
+++#endif
+++}
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#endif
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
++ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++ {
++     HEVCContext *s  = avctxt->priv_data;
++@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++     int y_ctb       = 0;
++     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++ 
+++#ifdef RPI
+++    s->enable_rpi = s->ps.sps->bit_depth == 8
+++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
+++    if (!s->enable_rpi) {
+++      if (s->ps.pps->cross_component_prediction_enabled_flag)
+++        printf("Cross component\n");
+++    }
+++#endif
+++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
++     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
++         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++         return AVERROR_INVALIDDATA;
++@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         }
++     }
++ 
+++#ifdef RPI_WORKER
+++    s->pass0_job = 0;
+++    s->pass1_job = 0;
+++#endif
+++#ifdef RPI
+++    rpi_begin(s);
+++#endif
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
++     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
++         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ 
++@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
++         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
++ 
+++#ifdef RPI_INTER_QPU
+++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+++#endif
+++#ifdef RPI_LUMA_QPU
+++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+++#endif
+ +
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
++         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+ +
+-+# get filter coefficients
+++#ifdef RPI_INTER_QPU
+++        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+++#endif
+++#ifdef RPI_LUMA_QPU
+++        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+++#endif
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+brr.anynn -, r:fast_path
+-+asr rb12, r0, rb23  # delay slot 1
+++#ifdef RPI
+++        if (s->enable_rpi) {
+++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+++          //av_assert0(s->pass0_job>=0);
+++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+++          s->ctu_count++;
+++          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++          if ( s->ctu_count >= s->max_ctu_count ) {
+++#ifdef RPI_WORKER
+++            if (s->used_for_ref) {
+++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+++              // Pass on this job to worker thread
+++              worker_submit_job(s);
+++              // Make sure we have space to prepare the next job
+++              worker_pass0_ready(s);
+ +
+-+mov r5rep, -8 # delay slot 2
+++              // Prepare the next batch of commands
+++              rpi_begin(s);
+++            } else {
+++              // Non-ref frame so do it all on this thread
+++              rpi_do_all_passes(s);
+++            }
+++#else
+++            rpi_do_all_passes(s);
+++#endif
+++          }
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+++        }
+++#endif
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+ +
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r2, r2, r3                                                    ; ldtmu0
+-+##
+-+## mov r0, ra22
+-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # apply horizontal filter
+-+##
+-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
+-+## min r2, r2, rb22
+-+## max ra13, r2, 0
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21
+-+## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra14, r0, 0
+-+##
+-+##
+-+##
+-+##
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra15, r0, 0
+-+
+-+
+-+
+-+
+-+mov r3, 0
+-+
+-+:loop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++         if (more_data < 0) {
++             s->tab_slice_address[ctb_addr_rs] = -1;
++             return more_data;
++@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++ 
++         ctb_addr_ts++;
++         ff_hevc_save_states(s, ctb_addr_ts);
+++#ifdef RPI
+++        if (s->enable_rpi)
+++            continue;
+++#endif
++         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
++     }
++ 
+++#ifdef RPI
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++#ifdef RPI_WORKER
+++    // Wait for the worker to finish all its jobs
+++    if (s->enable_rpi) {
+++        worker_wait(s);
+++    }
+++#endif
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++    // Finish off any half-completed rows
+++    if (s->enable_rpi && s->ctu_count) {
+++        rpi_do_all_passes(s);
+++    }
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++#endif
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++     if (x_ctb + ctb_size >= s->ps.sps->width &&
++         y_ctb + ctb_size >= s->ps.sps->height)
++         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
++@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
++     s = s1->sList[self_id];
++     lc = s->HEVClc;
++ 
+++#ifdef RPI
+++    s->enable_rpi = 0;
+++    //printf("Wavefront\n");
+++#endif
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
++     if(ctb_row) {
++         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
++ 
++@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
++         if (ret < 0)
++             return ret;
++ 
+++        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+++                        s->nal_unit_type == NAL_TSA_N   ||
+++                        s->nal_unit_type == NAL_STSA_N  ||
+++                        s->nal_unit_type == NAL_RADL_N  ||
+++                        s->nal_unit_type == NAL_RASL_N);
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:loop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+++            s->is_decoded = 0;
+++            break;
+++        }
++         if (s->max_ra == INT_MAX) {
++             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
++                 s->max_ra = s->poc;
++@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++     }
++ 
++ fail:
++-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+++#ifdef RPI_INTER_QPU
+++        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+++#endif
++         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
++-
+++    } else if (s->ref) {
+++#ifdef RPI_INTER_QPU
+++      // When running single threaded we need to flush the whole frame
+++      flush_frame(s,s->frame);
+++#endif
+++    }
++     return ret;
++ }
++ 
++@@ -3064,6 +4625,41 @@ fail:
++     return AVERROR(ENOMEM);
++ }
++ 
+++#ifdef RPI_WORKER
+++static av_cold void hevc_init_worker(HEVCContext *s)
+++{
+++    int err;
+++    pthread_cond_init(&s->worker_cond_head, NULL);
+++    pthread_cond_init(&s->worker_cond_tail, NULL);
+++    pthread_mutex_init(&s->worker_mutex, NULL);
+ +
+-+# apply vertical filter and write to VPM
+++    s->worker_tail=0;
+++    s->worker_head=0;
+++    s->kill_worker=0;
+++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+++    if (err) {
+++        printf("Failed to create worker thread\n");
+++        exit(-1);
+++    }
+++}
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:loop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++static av_cold void hevc_exit_worker(HEVCContext *s)
+++{
+++    void *res;
+++    s->kill_worker=1;
+++    pthread_cond_broadcast(&s->worker_cond_tail);
+++    pthread_join(s->worker_thread, &res);
+ +
+-+# DMA out
+++    pthread_cond_destroy(&s->worker_cond_head);
+++    pthread_cond_destroy(&s->worker_cond_tail);
+++    pthread_mutex_destroy(&s->worker_mutex);
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+++    s->worker_tail=0;
+++    s->worker_head=0;
+++    s->kill_worker=0;
+++}
+++#endif
+ +
+-+####################################################
+-+
+-+:fast_path
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+## sub r2, r2, r3                                                    ; ldtmu0
+-+##
+-+## mov r0, ra22
+-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # apply horizontal filter
+-+##
+-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
+-+## min r2, r2, rb22
+-+## max ra13, r2, 0
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21
+-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra14, r0, 0
+-+##
+-+##
+-+##
+-+##
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21   ; mul24    r3, r0, ra0
+-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra15, r0, 0
+-+
+-+
+-+mov r3, 0  # This signifies the amount of unrolling
+-+
+-+:fast_loop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++ {
++     HEVCContext       *s = avctx->priv_data;
++@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++ 
++     av_freep(&s->cabac_state);
++ 
+++#ifdef RPI
+ +
+-+# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+++#ifdef RPI_WORKER
+++    hevc_exit_worker(s);
+++#endif
+ +
+-+max r2, ra_y, 0
+-+min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+add t0s, ra_x2_base, r2
+++    for(i=0;i<RPI_MAX_JOBS;i++) {
+++      av_freep(&s->unif_mv_cmds[i]);
+++      av_freep(&s->univ_pred_cmds[i]);
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+sub r0, r2, r3       ; mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8       ; mov r1, ra22
+++#ifdef RPI_INTER_QPU
+++      if (s->unif_mvs[i]) {
+++        gpu_free( &s->unif_mvs_ptr[i] );
+++        s->unif_mvs[i] = 0;
+++      }
+++#endif
+++#ifdef RPI_LUMA_QPU
+++      if (s->y_unif_mvs[i]) {
+++        gpu_free( &s->y_unif_mvs_ptr[i] );
+++        s->y_unif_mvs[i] = 0;
+++      }
+++#endif
+++    }
+ +
+-+# apply horizontal filter
+++#endif
+ +
+-+brr.anyn -, r:fast_loop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
++     for (i = 0; i < 3; i++) {
++         av_freep(&s->sao_pixel_buffer_h[i]);
++         av_freep(&s->sao_pixel_buffer_v[i]);
++@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++     return 0;
++ }
++ 
+++#ifdef RPI
+++#ifdef RPI_PRECLEAR
+++static av_cold void memclear16(int16_t *p, int n)
+++{
+++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+++  //int i;
+++  //for(i=0;i<n;i++)
+++  //  p[i] = 0;
+++}
+++#endif
+++#endif
+ +
+-+# apply vertical filter and write to VPM
++ static av_cold int hevc_init_context(AVCodecContext *avctx)
++ {
++     HEVCContext *s = avctx->priv_data;
++     int i;
+++    int job;
++ 
++     s->avctx = avctx;
++ 
++@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
++     s->HEVClcList[0] = s->HEVClc;
++     s->sList[0] = s;
++ 
+++#ifdef RPI
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+++        if (!s->unif_mv_cmds[job])
+++            goto fail;
+++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+++        if (!s->univ_pred_cmds[job])
+++            goto fail;
+++    }
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:fast_loop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++#ifdef RPI_INTER_QPU
+++    // We divide the image into blocks 256 wide and 64 high
+++    // We support up to 2048 widths
+++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+++    // Also add space for the startup command for each stream.
+ +
+-+# DMA out
+++    {
+++        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+++        uint32_t *p;
+++		for(job=0;job<RPI_MAX_JOBS;job++) {
+++#ifdef RPI_CACHE_UNIF_MVS
+++          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++#else
+++          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++#endif
+++          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+++          // Set up initial locations for uniform streams
+++          p = s->unif_mvs[job];
+++          for(i = 0; i < 8; i++) {
+++            s->mvs_base[job][i] = p;
+++            p += uv_commands_per_qpu;
+++          }
+++        }
+++        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+++        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+++        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+++    }
+ +
+-+################################################################################
+++#endif
+++#ifdef RPI_LUMA_QPU
+++    for(job=0;job<RPI_MAX_JOBS;job++)
+++    {
+++        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+++        uint32_t *p;
+++#ifdef RPI_CACHE_UNIF_MVS
+++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+++#else
+++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+++#endif
+++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+ +
+-+# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+++        // Set up initial locations for uniform streams
+++        p = s->y_unif_mvs[job];
+++        for(i = 0; i < 12; i++) {
+++            s->y_mvs_base[job][i] = p;
+++            p += y_commands_per_qpu;
+++        }
+++    }
+++    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+++    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+++#endif
+++    //gpu_malloc_uncached(2048*64,&s->dummy);
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_b
+-+mov ra31, unif
+++    s->enable_rpi = 0;
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#ifdef RPI_WORKER
+++    hevc_init_worker(s);
+++#endif
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
++     s->cabac_state = av_malloc(HEVC_CONTEXTS);
++     if (!s->cabac_state)
++         goto fail;
++diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
++index be91010..6b03ea8 100644
++--- a/libavcodec/hevc.h
+++++ b/libavcodec/hevc.h
++@@ -23,6 +23,9 @@
++ #ifndef AVCODEC_HEVC_H
++ #define AVCODEC_HEVC_H
++ 
+++// define RPI to split the CABAC/prediction/transform into separate stages
+++#include "config.h"
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
++ #include "libavutil/buffer.h"
++ #include "libavutil/md5.h"
++ 
++@@ -37,6 +40,29 @@
++ #include "thread.h"
++ #include "videodsp.h"
++ 
+++// define RPI to split the CABAC/prediction/transform into separate stages
+++#ifdef RPI
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, 13
+-+shl r3, r3, 8 # Mask off top 8 bits
+-+shr r3, r3, 8
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+# In a B frame, so also set up VPM read
+-+add vr_setup, r3, rb28
+++  #include "rpi_qpu.h"
+++  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+++  #define RPI_INTER_QPU
+ +
+-+# get filter coefficients
+++  #ifdef RPI_INTER_QPU
+++    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+++    #define RPI_LUMA_QPU
+++  #endif
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+++  #define RPI_MAX_JOBS 2
+++  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+++  #define RPI_WORKER
+++  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+++//  #define RPI_DEBLOCK_VPU
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++#endif
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov r3, 0
+++#define RPI_VPU_DEBLOCK_CACHED 1
+ +
+-+:bloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++ #define MAX_DPB_SIZE 16 // A.4.1
++ #define MAX_REFS 16
++ 
++@@ -660,17 +686,6 @@ typedef struct CodingUnit {
++     uint8_t cu_transquant_bypass_flag;
++ } CodingUnit;
++ 
++-typedef struct Mv {
++-    int16_t x;  ///< horizontal component of motion vector
++-    int16_t y;  ///< vertical component of motion vector
++-} Mv;
++-
++-typedef struct MvField {
++-    DECLARE_ALIGNED(4, Mv, mv)[2];
++-    int8_t ref_idx[2];
++-    int8_t pred_flag;
++-} MvField;
++-
++ typedef struct NeighbourAvailable {
++     int cand_bottom_left;
++     int cand_left;
++@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
++     uint8_t flags;
++ } HEVCFrame;
++ 
+++#ifdef RPI_WORKER
+++typedef struct HEVCLocalContextIntra {
+++    TransformUnit tu;
+++    NeighbourAvailable na;
+++} HEVCLocalContextIntra;
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++ typedef struct HEVCLocalContext {
+++    TransformUnit tu;
+++    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
++     uint8_t cabac_state[HEVC_CONTEXTS];
++ 
++     uint8_t stat_coeff[4];
++@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
++ 
++     int qPy_pred;
++ 
++-    TransformUnit tu;
++ 
++     uint8_t ctb_left_flag;
++     uint8_t ctb_up_flag;
++@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
++     int ct_depth;
++     CodingUnit cu;
++     PredictionUnit pu;
++-    NeighbourAvailable na;
++ 
++ #define BOUNDARY_LEFT_SLICE     (1 << 0)
++ #define BOUNDARY_LEFT_TILE      (1 << 1)
++@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
++     int boundary_flags;
++ } HEVCLocalContext;
++ 
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++#ifdef RPI
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++// The processing is done in chunks
+++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
+++// This is a distance of 1536 pixels across the screen
+++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+++// but allocate more memory and increase the latency before data in the next frame can be processed
+++#define RPI_NUM_CHUNKS 1
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:bloop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+++#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+++// Each block can have an intra prediction and a transform_add command
+++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+++// Worst case is 16x16 CTUs
+++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+ +
+-+# apply vertical filter and write to VPM
+++#define RPI_CMD_LUMA_UNI 0
+++#define RPI_CMD_CHROMA_UNI 1
+++#define RPI_CMD_LUMA_BI 2
+++#define RPI_CMD_CHROMA_BI 3
+++#define RPI_CMD_V_BI 4
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 15          ; mov -, vr_wait
+-+min r1, r1, rb22
+-+add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+brr.anyn -, r:bloop
+-+max r1, r1, 0
+-+add r1, r1, r0
+-+shr vpm, r1, 1
+++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+++// #define RPI_PRECLEAR
+ +
+-+# DMA out
+++// Command for inter prediction
+++typedef struct HEVCMvCmd {
+++    int cmd;
+++    uint8_t *dst;
+++    ptrdiff_t dststride;
+++    uint8_t *src;
+++    ptrdiff_t srcstride;
+++    Mv mv;
+++    int x_off;
+++    int y_off;
+++    int block_w;
+++    int block_h;
+++    int weight;
+++    int offset;
+++    uint8_t *src1;
+++    ptrdiff_t srcstride1;
+++    Mv mv1;
+++    int8_t ref_idx[2];
+++} HEVCMvCmd;
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+ +
+-+################################################################################
+++// Command for intra prediction and transform_add of predictions to coefficients
+++#define RPI_PRED_TRANSFORM_ADD 0
+++#define RPI_PRED_INTRA 1
+++typedef struct HEVCPredCmd {
+++    uint8_t size;
+++    uint8_t type;
+++    uint8_t na;
+++    uint8_t c_idx;
+++    union {
+++        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+++        uint32_t x;   // RPI_PRED_INTRA
+++    };
+++    union {
+++        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+++        uint32_t y;   // RPI_PRED_INTRA
+++    };
+++    union {
+++        enum IntraPredMode mode; // RPI_PRED_INTRA
+++        uint32_t stride;         // RPI_PRED_TRANSFORM_ADD
+++    };
+++} HEVCPredCmd;
+ +
+-+# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+-+# This filter only does horizontal filtering.
+-+# It is assumed that the region to fetch does not include extra rows above.
+++#endif
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_honly
+-+mov ra31, unif
++ typedef struct HEVCContext {
++     const AVClass *c;  // needed by private avoptions
++     AVCodecContext *avctx;
++@@ -798,13 +895,107 @@ typedef struct HEVCContext {
++ 
++     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
++     HEVCLocalContext    *HEVClc;
++-
+++#ifdef RPI_WORKER
+++    HEVCLocalContextIntra HEVClcIntra;
+++#endif
++     uint8_t             threads_type;
++     uint8_t             threads_number;
++ 
++     int                 width;
++     int                 height;
++ 
+++    int used_for_ref;
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#ifdef RPI
+++    int enable_rpi;
+++    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+++    int buf_width;
+++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+++    int num_coeffs[RPI_MAX_JOBS][4];
+++    int num_xfm_cmds[RPI_MAX_JOBS];
+++    int num_mv_cmds[RPI_MAX_JOBS];
+++    int num_pred_cmds[RPI_MAX_JOBS];
+++    int num_dblk_cmds[RPI_MAX_JOBS];
+++    int vpu_id;
+++    int pass0_job; // Pass0 does coefficient decode
+++    int pass1_job; // Pass1 does pixel processing
+++    int ctu_count; // Number of CTUs done in pass0 so far
+++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+++    int ctu_per_y_chan; // Number of CTUs per luma QPU
+++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+++#ifdef RPI_INTER_QPU
+++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
+++    // _base pointers are to the start of the row
+++    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+++    // these pointers are to the next free space
+++    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+++    // Function pointers
+++    uint32_t mc_filter_uv;
+++    uint32_t mc_filter_uv_b0;
+++    uint32_t mc_filter_uv_b;
+++#endif
+++#ifdef RPI_LUMA_QPU
+++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+++    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+++    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+++    uint32_t *curr_y_mvs; // Current uniform stream for luma
+++    // Function pointers
+++    uint32_t mc_filter;
+++    uint32_t mc_filter_b;
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
+++#ifdef RPI_WORKER
+++    pthread_t worker_thread;
+++    pthread_cond_t worker_cond_head;
+++    pthread_cond_t worker_cond_tail;
+++    pthread_mutex_t worker_mutex;
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
+++    int worker_tail; // Contains the number of posted jobs
+++    int worker_head; // Contains the number of completed jobs
+++    int kill_worker; // set to 1 to terminate the worker
+++#endif
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
+-+shl r0, r0, 7 ; mov rb18,r0
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++#define RPI_DEBLOCK_VPU_Q_COUNT 2
+ +
+-+# get filter coefficients
+++#ifdef RPI_DEBLOCK_VPU
+++    int enable_rpi_deblock;
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov r0, unif
+++    int uv_setup_width;
+++    int uv_setup_height;
+++    int setup_width; // Number of 16x16 blocks across the image
+++    int setup_height; // Number of 16x16 blocks down the image
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+-+mov r3, 0
+++    struct dblk_vpu_q_s
+++    {
+++        GPU_MEM_PTR_T deblock_vpu_gmem;
+ +
+-+:loop_honly
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++        uint8_t (*y_setup_arm)[2][2][2][4];
+++        uint8_t (*y_setup_vc)[2][2][2][4];
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++        uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+++        uint8_t (*uv_setup_vc)[2][2][2][4];
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+++        int vpu_cmds_vc;
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++        int cmd_id;
+++    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++    struct dblk_vpu_q_s * dvq;
+++    unsigned int dvq_n;
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3       ; mov r3, rb31
+++#endif
+ +
+-+sub.setf -, r3, rb18 ; mov r1, ra22
+++#endif
+ +
+-+mov -, vw_wait   ; mul24 r0, r0, r1
+-+brr.anyn -, r:loop_honly
+-+asr r0, r0, 15          # delay 1
+-+min r0, r0, rb22        # delay 2
+-+max vpm, r0, 0          # delay 3
++     uint8_t *cabac_state;
++ 
++     /** 1 if the independent slice segment header was successfully parsed */
++@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
++     uint32_t max_mastering_luminance;
++     uint32_t min_mastering_luminance;
++ 
+++#ifdef RPI
+++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+++#endif
++ } HEVCContext;
++ 
++ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
++@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                                  int log2_trafo_size, enum ScanType scan_idx,
++                                  int c_idx);
++ 
+++#ifdef RPI_INTER_QPU
+++extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+++#endif
+ +
+-+# DMA out
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
++ 
++ 
++diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
++index 05b2821..e2f1f4e 100644
++--- a/libavcodec/hevc_cabac.c
+++++ b/libavcodec/hevc_cabac.c
++@@ -21,14 +21,72 @@
++  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
+++#define UNCHECKED_BITSTREAM_READER 1
+ +
++ #include "libavutil/attributes.h"
++ #include "libavutil/common.h"
++ 
++-#include "cabac_functions.h"
++ #include "hevc.h"
+++#include "cabac_functions.h"
+++
+++// BY22 is probably faster than simple bypass if the processor has
+++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+++// x86 has fast int divide
+++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+++// Use native divide if we have a fast one - otherwise use mpy 1/x
+++// x86 has a fast integer divide - arm doesn't - unsure about other
+++// architectures
+++#define USE_BY22_DIV  ARCH_X86
+++
+++// Special case blocks with a single significant coeff
+++// Decreases the complexity of the code for a common case but increases the
+++// code size.
+++#define USE_N_END_1 1
+++
+++#if ARCH_ARM
+++#include "arm/hevc_cabac.h"
+++#endif
++ 
++ #define CABAC_MAX_BIN 31
++ 
+++
+++#if USE_BY22 && !USE_BY22_DIV
+++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+++
+++static const uint32_t cabac_by22_inv_range[256] = {
+++                                                    0,      I(257), I(258), I(259),
+++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+++    I(510), I(511)
+++};
+++#undef I
+++#endif  // USE_BY22 && !USE_BY22_DIV
+ +
+-+################################################################################
++ /**
++  * number of bin by SyntaxElement.
++  */
++@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
++     { 28, 36, 43, 49, 54, 58, 61, 63, },
++ };
++ 
+ +
+-+# mc_exit()
+++typedef struct
+++{
+++    uint16_t coeff;
+++    uint16_t scale;
+++} xy_off_t;
+++
+++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+++
+++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+++
+++#define OFF_DIAG(t) {\
+++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+++}
+ +
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+++#define OFF_HORIZ(t) {\
+++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+++}
+ +
+-+mov -,srel(0)
+++#define OFF_VERT(t) {\
+++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+++}
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++static const xy_off_t off_xys[3][4][16] =
+++{
+++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+++};
+ +
+-+nop        ; nop ; thrend
+-+nop        ; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+ +
+-+::mc_exit1
+-+mov  -, vw_wait # wait on the VDW
+++// Helper fns
+++#ifndef hevc_mem_bits32
+++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+++{
+++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+++}
+++#endif
+ +
+-+#mov -,srel(1)
+++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+++#define hevc_clz32 hevc_clz32_builtin
+++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+++{
+++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+++}
+++#endif
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++// It is unlikely that we will ever need this but include for completeness
+++#ifndef hevc_clz32
+++static inline unsigned int hevc_clz32(unsigned int x)
+++{
+++    unsigned int n = 1;
+++    if ((x & 0xffff0000) == 0) {
+++        n += 16;
+++        x <<= 16;
+++    }
+++    if ((x & 0xff000000) == 0) {
+++        n += 8;
+++        x <<= 8;
+++    }
+++    if ((x & 0xf0000000) == 0) {
+++        n += 4;
+++        x <<= 4;
+++    }
+++    if ((x & 0xc0000000) == 0) {
+++        n += 2;
+++        x <<= 2;
+++    }
+++    return n - ((x >> 31) & 1);
+++}
+++#endif
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+ +
+-+# mc_interrupt_exit()
+-+::mc_interrupt_exit
+-+mov  -, vw_wait # wait on the VDW
+++#if !USE_BY22
+++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+++// will no longer be called but the setup calls will still exist and we want
+++// to null them out
+++#define bypass_start(s)
+++#define bypass_finish(s)
+++#else
+++// Use BY22 for residual bypass block
+++
+++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+++
+++// BY22 notes that bypass is simply a divide into the bitstream and so we
+++// can peek out large quantities of bits at once and treat the result as if
+++// it was VLC.  In many cases this will lead to O(1) processing rather than
+++// O(n) though the setup and teardown is sufficiently expensive that it is
+++// only worth using if we expect to be dealing with more than a few bits
+++// The definition of "a few bits" will vary from platform to platform but
+++// tests on ARM show that it probably isn't worth it for a single coded
+++// residual, but is for >1 - it also seems likely that if there are
+++// more residuals then they are likely to be bigger and this will make the
+++// O(1) nature of the code more worthwhile.
+++
+++
+++#if !USE_BY22_DIV
+++// * 1/x @ 32 bits gets us 22 bits of accuracy
+++#define CABAC_BY22_PEEK_BITS  22
+++#else
+++// A real 32-bit divide gets us another bit
+++// If we have a 64 bit int & a unit time divider then we should get a lot
+++// of bits (55)  but that is untested and it is unclear if it would give
+++// us a large advantage
+++#define CABAC_BY22_PEEK_BITS  23
+++#endif
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++// Bypass block start
+++// Must be called before _by22_peek is used as it sets the CABAC environment
+++// into the correct state.  _by22_finish must be called to return to 'normal'
+++// (i.e. non-bypass) cabac decoding
+++static inline void get_cabac_by22_start(CABACContext * const c)
+++{
+++    const unsigned int bits = __builtin_ctz(c->low);
+++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+++#if !USE_BY22_DIV
+++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+++#endif
+ +
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+mov -,sacq(0) # 8
+-+mov -,sacq(0) # 9
+-+mov -,sacq(0) # 10
+-+mov -,sacq(0) # 11
+++    c->bytestream -= (CABAC_BITS / 8);
+++    c->by22.bits = bits;
+++#if !USE_BY22_DIV
+++    c->by22.range = c->range;
+++    c->range = inv;
+++#endif
+++    c->low = x;
+++}
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++// Bypass block finish
+++// Must be called at the end of the bypass block to return to normal operation
+++static inline void get_cabac_by22_finish(CABACContext * const c)
+++{
+++    unsigned int used = c->by22.bits;
+++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+++
+++    c->bytestream += bytes_used + (CABAC_BITS / 8);
+++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+++#if !USE_BY22_DIV
+++    c->range = c->by22.range;
+++#endif
+++}
+ +
+-+# mc_interrupt_exit4()
+-+::mc_interrupt_exit4
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+# mc_interrupt_exit8()
+-+::mc_interrupt_exit8
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+::mc_setup_uv
+-+
+-+# Read starting kernel
+-+mov ra31, unif
+-+
+-+# Load first request location
+-+add ra_x_base, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_x2_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+++// Peek bypass bits
+++// _by22_start must be called before _by22_peek is called and _by22_flush
+++// must be called afterwards to flush any used bits
+++// The actual number of valid bits returned is
+++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+++// will be at least 22 which should be long enough for any prefix or suffix
+++// though probably not long enough for the worst case combination
+++#ifndef get_cabac_by22_peek
+++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+++{
+++#if USE_BY22_DIV
+++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+++#else
+++    uint32_t x = c->low & ~1U;
+++    const uint32_t inv = c->range;
+ +
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++    if (inv != 0)
+++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+ +
+-+# get source pitch
+-+mov rb16, unif
+++    return x << 1;
+++#endif
+++}
+++#endif
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++// Flush bypass bits peeked by _by22_peek
+++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+++// val is an unmodified copy of whatever _by22_peek returned
+++#ifndef get_cabac_by22_flush
+++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+++{
+++    // Subtract the bits used & reshift up to the top of the word
+++#if USE_BY22_DIV
+++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+++#else
+++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+++#endif
+ +
+-+# load constants
+++    // and refill lower bits
+++    // We will probably OR over some existing bits but that doesn't matter
+++    c->by22.bits += n;
+++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+++}
+++#endif
+ +
+-+mov ra20, 1
+-+mov ra21, 64
+-+mov ra22, 256
+-+mov ra23, 8
+++#endif  // USE_BY22
+ +
+-+mov rb20, 0xffffff00
+-+mov rb21, 64
+-+mov rb22, 255
+-+mov rb23, 24
+ +
+-+# touch vertical context to keep simulator happy
++ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
++ {
++     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
++@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
++     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
++ }
++ 
++-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
++ }
++ 
++-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
++ }
++ 
++-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
++ }
++ 
++ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
++@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
++     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
++ }
++ 
++-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
++                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
++ {
++     int i = 0;
++     int max = (log2_size << 1) - 1;
++     int ctx_offset, ctx_shift;
++ 
++-    if (!c_idx) {
+++    if (!c_idx_nz) {
++         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
++         ctx_shift = (log2_size + 1) >> 2;
++     } else {
++@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
++     return value;
++ }
++ 
++-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
++ {
++     int inc;
++ 
++-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+++    inc = (ctx_cg != 0) + (c_idx_nz << 1);
++ 
++     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
++ }
++-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
++-                                           int offset, const uint8_t *ctx_idx_map)
++-{
++-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
++-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
++-}
++ 
++-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
++ {
++     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
++ }
++@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
++     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
++ }
++ 
++-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++#if !USE_BY22
+++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+++#endif
+ +
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+ +
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+add rb28, r0, r1
+++#ifndef coeff_abs_level_remaining_decode_bypass
+++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
++ {
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint32_t y;
+++    unsigned int prefix;
+++    unsigned int last_coeff_abs_level_remaining;
+++    unsigned int n;
+++
+++    y = get_cabac_by22_peek(c);
+++    prefix = hevc_clz32(~y);
+++    // y << prefix will always have top bit 0
+++
+++    if (prefix < 3) {
+++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+++        n = prefix + 1 + rice_param;
+++    }
+++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+++    {
+++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+ +
+-+# Compute base address for first and second access
+-+mov r0, ra_x_base           # Load x
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+add ra_y, r1, 1
+-+add r0, r0, r3
+-+and r0, r0, ~3
+-+max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+-+# submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r2, r1
+++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+++        n = prefix * 2 + rice_param - 2;
+++    }
+++    else {
+++        unsigned int suffix;
+ +
+-+# Dump padding words
+-+mov r0, unif
+-+mov r0, unif
+-+mov r0, unif
+++        get_cabac_by22_flush(c, prefix, y);
+++        y = get_cabac_by22_peek(c);
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x_base
+-+add t0s, r1, ra_x2_base
+++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+++        n = prefix + rice_param - 2;
+++    }
+ +
+++    get_cabac_by22_flush(c, n, y);
+ +
+++    return last_coeff_abs_level_remaining;
+++}
+++#endif
+ +
+-+################################################################################
+++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
++     int prefix = 0;
++     int suffix = 0;
++     int last_coeff_abs_level_remaining;
++     int i;
++ 
++-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
++         prefix++;
++     if (prefix == CABAC_MAX_BIN) {
++         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
++         return 0;
++     }
+ +
+-+::mc_filter_uv_b
+-+mov ra31, unif
++     if (prefix < 3) {
++         for (i = 0; i < rc_rice_param; i++)
++-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+++            suffix = (suffix << 1) | get_cabac_bypass(c);
++         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
++     } else {
++         int prefix_minus3 = prefix - 3;
++         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
++-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+++            suffix = (suffix << 1) | get_cabac_bypass(c);
++         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
++                                               << rc_rice_param) + suffix;
++     }
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
++     return last_coeff_abs_level_remaining;
++ }
++ 
++-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+++#if !USE_BY22
+++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
++ {
++-    int i;
++-    int ret = 0;
+++    CABACContext * const c = &s->HEVClc->cc;
+++    unsigned int i;
+++    uint32_t ret = 0;
++ 
++     for (i = 0; i < nb; i++)
++-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
++-    return ret;
+++        ret = (ret << 1) | get_cabac_bypass(c);
+ +
+-+mov ra_xshift, ra_xshift_next
+++    return ret << (32 - nb);
+++}
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+++#ifndef coeff_sign_flag_decode_bypass
+++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint32_t y;
+++    y = get_cabac_by22_peek(c);
+++    get_cabac_by22_flush(c, nb, y);
+++    return y & ~(0xffffffffU >> nb);
+++}
+++#endif
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+++#ifndef get_cabac_greater1_bits
+++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+++    uint8_t * const state0)
+++{
+++    unsigned int i;
+++    unsigned int rv = 0;
+++    for (i = 0; i != n; ++i) {
+++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+++        const unsigned int b = get_cabac(c, state0 + idx);
+++        rv = (rv << 1) | b;
+++    }
+++    return rv;
+++}
+++#endif
+ +
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, 13
+-+shl r3, r3, 8 # Mask off top 8 bits
+-+shr r3, r3, 8
+ +
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++// N.B. levels returned are the values assuming coeff_abs_level_remaining
+++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
+++// this version of events.
+++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+++    int * const pprev_subset_coded, int * const psum,
+++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+++    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+++    unsigned int rv;
+++    unsigned int i;
+++    const unsigned int n = FFMIN(n_end, 8);
+ +
+-+# In a B frame, so also set up VPM read
+-+add vr_setup, r3, rb28
+++    // Really this is i != n but the simple unconditional loop is cheaper
+++    // and faster
+++    for (i = 0; i != 8; ++i)
+++        levels[i] = 1;
+ +
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+++    rv = get_cabac_greater1_bits(c, n, state0);
+ +
+-+# get filter coefficients
+++    *pprev_subset_coded = 0;
+++    *psum = n;
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++    rv <<= (32 - n);
+++    if (rv != 0)
+++    {
+++        *pprev_subset_coded = 1;
+++        *psum = n + 1;
+++        i = hevc_clz32(rv);
+++        levels[i] = 2;
+++        if (get_cabac(c, state_gt2) == 0)
+++        {
+++            // Unset first coded bit
+++            rv &= ~(0x80000000U >> i);
+++        }
+++    }
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++    if (n_end > 8) {
+++        const unsigned int g8 = n_end - 8;
+++        rv |= ((1 << g8) - 1) << (24 - g8);
+++        for (i = 0; i != g8; ++i) {
+++            levels[i + 8] = 0;
+++        }
+++    }
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++    return rv;
+++}
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++// extended_precision_processing_flag must be false given we are
+++// putting the result into a 16-bit array
+++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+++// scale_m is uint8_t
+++//
+++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
+++//   or it can be 2 (if we have transquant_bypass)
+++// shift is set to one less than we really want but would normally be
+++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+++// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6
+++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+++// to achieve it
+++
+++#ifndef trans_scale_sat
+++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+++{
+++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
++ }
+++#endif
+ +
+-+mov r3, 0
+ +
+-+:uvloop_b
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++#ifndef update_rice
+++static inline void update_rice(uint8_t * const stat_coeff,
+++    const unsigned int last_coeff_abs_level_remaining,
+++    const unsigned int c_rice_param)
+++{
+++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+++    if (x >= 6)
+++        (*stat_coeff)++;
+++    else if (x == 0 && *stat_coeff > 0)
+++        (*stat_coeff)--;
+++}
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++// n must be > 0 on entry
+++#ifndef get_cabac_sig_coeff_flag_idxs
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * p)
+++{
+++    do {
+++        if (get_cabac(c, state0 + ctx_map[n]))
+++            *p++ = n;
+++    } while (--n != 0);
+++    return p;
+++}
+++#endif
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * const flag_idx)
+++{
+++    int rv;
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop_b
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++    return rv;
+++}
+ +
+-+# apply vertical filter and write to VPM
+++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x1,  x2,  x3,\
+++     x4,  x5,  x6,  x7,\
+++     x8,  x9, x10, x11,\
+++    x12, x13, x14, x15}
+++
+++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x4,  x8, x12,\
+++     x1,  x5,  x9, x13,\
+++     x2,  x6, x10, x14,\
+++     x3,  x7, x11, x15}
+++
+++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x4,  x1,  x8,\
+++     x5,  x2, x12,  x9,\
+++     x6,  x3, x13, x10,\
+++     x7, x14, x11, x15}
+++
+++
+++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+++    uint8_t * const significant_coeff_group_flag,
+++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+++    int * const pPrev_sig)
+++{
+++    while (--i >= 0) {
+++        unsigned int x_cg = scan_x_cg[i];
+++        unsigned int y_cg = scan_y_cg[i];
+++
+++        // For the flag decode we only care about Z/NZ but
+++        // we use the full Right + Down * 2 when calculating
+++        // significant coeff flags so we obtain it here
+++        //.
+++        // The group flag array is one longer than it needs to
+++        // be so we don't need to check for y_cg limits
+++        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+++            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+++
+++        if (i == 0 ||
+++            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+++        {
+++            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+++            *pPrev_sig = prev_sig;
+++            break;
+++        }
+++    }
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+brr.anyn -, r:uvloop_b
+-+max r1, r1, 0
+-+add r1, r1, r0
+-+shr vpm, r1, 1
+++    return i;
+++}
+ +
++ 
++ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                                 int log2_trafo_size, enum ScanType scan_idx,
++                                 int c_idx)
++ {
++-#define GET_COORD(offset, n)                                    \
++-    do {                                                        \
++-        x_c = (x_cg << 2) + scan_x_off[n];                      \
++-        y_c = (y_cg << 2) + scan_y_off[n];                      \
++-    } while (0)
++-    HEVCLocalContext *lc = s->HEVClc;
++-    int transform_skip_flag = 0;
+++    HEVCLocalContext * const lc = s->HEVClc;
+++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
++ 
++     int last_significant_coeff_x, last_significant_coeff_y;
++-    int last_scan_pos;
++-    int n_end;
++     int num_coeff = 0;
++-    int greater1_ctx = 1;
+++    int prev_subset_coded = 0;
++ 
++     int num_last_subset;
++     int x_cg_last_sig, y_cg_last_sig;
++ 
++-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+++    const uint8_t *scan_x_cg, *scan_y_cg;
+++    const xy_off_t * scan_xy_off;
++ 
++     ptrdiff_t stride = s->frame->linesize[c_idx];
++     int hshift = s->ps.sps->hshift[c_idx];
++     int vshift = s->ps.sps->vshift[c_idx];
++     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
++                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+++#ifdef RPI
+++    //***** transform_skip_flag decoded later!
+++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+++#endif
++     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
++     int explicit_rdpcm_flag = 0;
++     int explicit_rdpcm_dir_flag;
++ 
++     int trafo_size = 1 << log2_trafo_size;
++     int i;
++-    int qp,shift,add,scale,scale_m;
+++    int qp,shift,scale;
++     static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
++     const uint8_t *scale_matrix = NULL;
++     uint8_t dc_scale;
++     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
++                                          lc->tu.intra_pred_mode_c;
++ 
+++    int prev_sig = 0;
+++    const int c_idx_nz = (c_idx != 0);
+ +
+-+# DMA out for U
+++    int may_hide_sign;
+ +
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        int n = trafo_size * trafo_size;
+++        if (use_vpu) {
+++            // We support size 4 and size 5.
+++            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+++            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+++            // num_coeffs is indexed by log2_trafo_size-2
+++            if (log2_trafo_size == 4)
+++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+++            else
+++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+++            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+++        } else {
+++            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+++            s->num_coeffs[s->pass0_job][0] += n;
+++        }
+++    }
+++    // We now do the memset after transform_add while we know the data is cached.
+++    #ifdef RPI_PRECLEAR
+++    #else
+++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+++    #endif
+++#else
++     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+++#endif
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+ +
+-+::mc_end
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-new file mode 100644
+-index 0000000..fbebbbe
+---- /dev/null
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -0,0 +1,425 @@
+-+/*
+-+Copyright (c) 2012, Broadcom Europe Ltd
+-+All rights reserved.
++ 
++     // Derive QP for dequant
++     if (!lc->cu.cu_transquant_bypass_flag) {
++-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+++        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
++         static const uint8_t rem6[51 + 4 * 6 + 1] = {
++             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
++             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
++@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++         };
++         int qp_y = lc->qp_y;
++ 
+++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+++
++         if (s->ps.pps->transform_skip_enabled_flag &&
++             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
++-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+++            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+++            if (transform_skip_flag) {
+++                trans_skip_or_bypass = 1;
+++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
+++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
+++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+++                    may_hide_sign = 0;
+++                }
+++            }
++         }
++ 
++         if (c_idx == 0) {
++@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             qp += s->ps.sps->qp_bd_offset;
++         }
++ 
++-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
++-        add      = 1 << (shift-1);
++-        scale    = level_scale[rem6[qp]] << (div6[qp]);
++-        scale_m  = 16; // default when no custom scaling lists.
++-        dc_scale = 16;
+++        // Shift is set to one less than will actually occur as the scale
+++        // and saturate step adds 1 and then shifts right again
+++        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+++        scale = level_scale[rem6[qp]];
+++        if (div6[qp] >= shift) {
+++            scale <<= (div6[qp] - shift);
+++            shift = 0;
+++        } else {
+++            shift -= div6[qp];
+++        }
++ 
++-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
++             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
++-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
++ 
++             matrix_id = 3 * matrix_id + c_idx;
++ 
++             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+++            dc_scale = scale_matrix[0];
++             if (log2_trafo_size >= 4)
++                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
++         }
+++        else
+++        {
+++            static const uint8_t sixteen_scale[64] = {
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16
+++            };
+++            scale_matrix = sixteen_scale;
+++            dc_scale = 16;
+++        }
++     } else {
+++        static const uint8_t unit_scale[64] = {
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++        };
+++        scale_matrix = unit_scale;
++         shift        = 0;
++-        add          = 0;
++-        scale        = 0;
++-        dc_scale     = 0;
+++        scale        = 2;  // We will shift right to kill this
+++        dc_scale     = 1;
+++
+++        may_hide_sign = 0;
++     }
++ 
++     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
++-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+++        trans_skip_or_bypass) {
+++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
++         if (explicit_rdpcm_flag) {
++-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+++            may_hide_sign = 0;
+++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
++         }
++     }
++ 
++-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+++    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
++                                            &last_significant_coeff_x, &last_significant_coeff_y);
++ 
++     if (last_significant_coeff_x > 3) {
++@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++         int last_x_c = last_significant_coeff_x & 3;
++         int last_y_c = last_significant_coeff_y & 3;
++ 
++-        scan_x_off = ff_hevc_diag_scan4x4_x;
++-        scan_y_off = ff_hevc_diag_scan4x4_y;
++         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
++-        if (trafo_size == 4) {
+++
+++        switch (log2_trafo_size) {
+++        case 2:
++             scan_x_cg = scan_1x1;
++             scan_y_cg = scan_1x1;
++-        } else if (trafo_size == 8) {
+++            break;
+++        case 3:
++             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = diag_scan2x2_x;
++             scan_y_cg = diag_scan2x2_y;
++-        } else if (trafo_size == 16) {
+++            break;
+++        case 4:
++             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = ff_hevc_diag_scan4x4_x;
++             scan_y_cg = ff_hevc_diag_scan4x4_y;
++-        } else { // trafo_size == 32
+++            break;
+++        case 5:
+++        default:
++             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = ff_hevc_diag_scan8x8_x;
++             scan_y_cg = ff_hevc_diag_scan8x8_y;
+++            break;
++         }
++         break;
++     }
++     case SCAN_HORIZ:
++         scan_x_cg = horiz_scan2x2_x;
++         scan_y_cg = horiz_scan2x2_y;
++-        scan_x_off = horiz_scan4x4_x;
++-        scan_y_off = horiz_scan4x4_y;
++         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
++         break;
++     default: //SCAN_VERT
++         scan_x_cg = horiz_scan2x2_y;
++         scan_y_cg = horiz_scan2x2_x;
++-        scan_x_off = horiz_scan4x4_y;
++-        scan_y_off = horiz_scan4x4_x;
++         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
++         break;
++     }
++     num_coeff++;
++     num_last_subset = (num_coeff - 1) >> 4;
++ 
++-    for (i = num_last_subset; i >= 0; i--) {
++-        int n, m;
++-        int x_cg, y_cg, x_c, y_c, pos;
++-        int implicit_non_zero_coeff = 0;
++-        int64_t trans_coeff_level;
++-        int prev_sig = 0;
++-        int offset = i << 4;
++-        int rice_init = 0;
+++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++ 
++-        uint8_t significant_coeff_flag_idx[16];
++-        uint8_t nb_significant_coeff_flag = 0;
++-
++-        x_cg = scan_x_cg[i];
++-        y_cg = scan_y_cg[i];
++-
++-        if ((i < num_last_subset) && (i > 0)) {
++-            int ctx_cg = 0;
++-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
++-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++ 
++-            significant_coeff_group_flag[x_cg][y_cg] =
++-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
++-            implicit_non_zero_coeff = 1;
++-        } else {
++-            significant_coeff_group_flag[x_cg][y_cg] =
++-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
++-             (x_cg == 0 && y_cg == 0));
++-        }
+++    i = num_last_subset;
+++    do {
+++        int implicit_non_zero_coeff = 0;
+++        int n_end;
++ 
++-        last_scan_pos = num_coeff - offset - 1;
+++        uint8_t significant_coeff_flag_idx[16];
+++        unsigned int nb_significant_coeff_flag = 0;
++ 
++         if (i == num_last_subset) {
+++            // First time through
+++            int last_scan_pos = num_coeff - (i << 4) - 1;
++             n_end = last_scan_pos - 1;
++             significant_coeff_flag_idx[0] = last_scan_pos;
++             nb_significant_coeff_flag = 1;
++         } else {
++             n_end = 15;
+++            implicit_non_zero_coeff = (i != 0);
++         }
++ 
++-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
++-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
++-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
++-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
++-
++-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
++-            static const uint8_t ctx_idx_map[] = {
++-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
++-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
++-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
++-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
++-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+++        if (n_end >= 0) {
+++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
+++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
+++            };
+++            static const uint8_t ctx_idx_maps[3][4][16] = {
+++                {
+++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                },
+++                {
+++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                },
+++                {
+++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                }
++             };
++             const uint8_t *ctx_idx_map_p;
++             int scf_offset = 0;
++-            if (s->ps.sps->transform_skip_context_enabled_flag &&
++-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
++-                if (c_idx == 0) {
++-                    scf_offset = 40;
++-                } else {
++-                    scf_offset = 14 + 27;
++-                }
+++
+++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+++                ctx_idx_map_p = ctx_idx_maps[0][3];
+++                scf_offset = 40 + c_idx_nz;
++             } else {
++-                if (c_idx != 0)
+++                if (c_idx_nz != 0)
++                     scf_offset = 27;
+++
++                 if (log2_trafo_size == 2) {
++-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
++                 } else {
++-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
++-                    if (c_idx == 0) {
++-                        if ((x_cg > 0 || y_cg > 0))
+++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+++                    if (!c_idx_nz) {
+++                        if (i != 0)
++                             scf_offset += 3;
+++
++                         if (log2_trafo_size == 3) {
++                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
++                         } else {
++@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                     }
++                 }
++             }
++-            for (n = n_end; n > 0; n--) {
++-                x_c = scan_x_off[n];
++-                y_c = scan_y_off[n];
++-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
++-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
++-                    nb_significant_coeff_flag++;
+++
+++            if (n_end > 0) {
+++                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+++                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+++                    n_end, ctx_idx_map_p,
+++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
+++
+++                nb_significant_coeff_flag += cnt;
+++                if (cnt != 0) {
++                     implicit_non_zero_coeff = 0;
++                 }
++             }
+ +
+-+Redistribution and use in source and binary forms, with or without
+-+modification, are permitted provided that the following conditions are met:
+-+    * Redistributions of source code must retain the above copyright
+-+      notice, this list of conditions and the following disclaimer.
+-+    * Redistributions in binary form must reproduce the above copyright
+-+      notice, this list of conditions and the following disclaimer in the
+-+      documentation and/or other materials provided with the distribution.
+-+    * Neither the name of the copyright holder nor the
+-+      names of its contributors may be used to endorse or promote products
+-+      derived from this software without specific prior written permission.
++             if (implicit_non_zero_coeff == 0) {
++-                if (s->ps.sps->transform_skip_context_enabled_flag &&
++-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-                    if (c_idx == 0) {
++-                        scf_offset = 42;
++-                    } else {
++-                        scf_offset = 16 + 27;
++-                    }
+++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+++                    scf_offset = 42 + c_idx_nz;
++                 } else {
++                     if (i == 0) {
++-                        if (c_idx == 0)
++-                            scf_offset = 0;
++-                        else
++-                            scf_offset = 27;
+++                        scf_offset = c_idx_nz ? 27 : 0;
++                     } else {
++                         scf_offset = 2 + scf_offset;
++                     }
++                 }
++-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+++                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
++                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++                     nb_significant_coeff_flag++;
++                 }
++@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             }
++         }
++ 
++-        n_end = nb_significant_coeff_flag;
++-
+++        if (nb_significant_coeff_flag != 0) {
+++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+++                prev_subset_coded;
+++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+++                (gt1_idx_delta << 2);
+++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+++                gt1_idx_delta;
+++
+++            const unsigned int x_cg = scan_x_cg[i];
+++            const unsigned int y_cg = scan_y_cg[i];
+++            int16_t * const blk_coeffs = coeffs +
+++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
+++            // This calculation is 'wrong' for log2_trafo_size == 2
+++            // but that doesn't matter as in this case x_cg & y_cg
+++            // are always 0 so the result is correct (0) anyway
+++            const uint8_t * const blk_scale = scale_matrix +
+++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+++
+++            // * The following code block doesn't deal with these flags:
+++            //   (nor did the one it replaces)
+++            //
+++            // cabac_bypass_alignment_enabled_flag
+++            //    This should be easy but I can't find a test case
+++            // extended_precision_processing_flag
+++            //    This can extend the required precision past 16bits
+++            //    so is probably tricky - also no example found yet
+++
+++#if USE_N_END_1
+++            if (nb_significant_coeff_flag == 1) {
+++                // There is a small gain to be had from special casing the single
+++                // transform coefficient case.  The reduction in complexity
+++                // makes up for the code duplication.
+++
+++                int trans_coeff_level = 1;
+++                int coeff_sign_flag;
+++                int coded_val = 0;
+++
+++                // initialize first elem of coeff_abs_level_greater1_flag
+++                prev_subset_coded = 0;
+++
+++                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+++                    trans_coeff_level = 2;
+++                    prev_subset_coded = 1;
+++                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+++                }
++ 
++-        if (n_end) {
++-            int first_nz_pos_in_cg;
++-            int last_nz_pos_in_cg;
++-            int c_rice_param = 0;
++-            int first_greater1_coeff_idx = -1;
++-            uint8_t coeff_abs_level_greater1_flag[8];
++-            uint16_t coeff_sign_flag;
++-            int sum_abs = 0;
++-            int sign_hidden;
++-            int sb_type;
+++                // Probably not worth the overhead of starting by22 for just one value
+++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
++ 
+++                if (coded_val)
+++                {
+++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+++                    } else {
+++                        uint8_t * const stat_coeff =
+++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+++                        const unsigned int c_rice_param = *stat_coeff >> 2;
+++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++ 
++-            // initialize first elem of coeff_bas_level_greater1_flag
++-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                    }
+++                }
++ 
++-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
++-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
++-                else
++-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
++-                c_rice_param = lc->stat_coeff[sb_type] / 4;
++-            }
+++                {
+++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++ 
++-            if (!(i == num_last_subset) && greater1_ctx == 0)
++-                ctx_set++;
++-            greater1_ctx = 1;
++-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
++-
++-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
++-                int inc = (ctx_set << 2) + greater1_ctx;
++-                coeff_abs_level_greater1_flag[m] =
++-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
++-                if (coeff_abs_level_greater1_flag[m]) {
++-                    greater1_ctx = 0;
++-                    if (first_greater1_coeff_idx == -1)
++-                        first_greater1_coeff_idx = m;
++-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
++-                    greater1_ctx++;
+++                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
+++                        (trans_coeff_level ^ k) - k,  // Apply sign
+++                        scale,
+++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+++                        shift);
++                 }
++             }
++-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
++-
++-            if (lc->cu.cu_transquant_bypass_flag ||
++-                (lc->cu.pred_mode ==  MODE_INTRA  &&
++-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
++-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
++-                 explicit_rdpcm_flag)
++-                sign_hidden = 0;
++             else
++-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+++#endif
+++            {
+++                int sign_hidden = may_hide_sign;
+++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
+++                uint32_t coeff_sign_flags;
+++                uint32_t coded_vals = 0;
+++                // Sum(abs(level[]))
+++                // In fact we only need the bottom bit and in some future
+++                // version that may be all we calculate
+++                unsigned int sum_abs;
+++
+++                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+++
+++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+++                    sign_hidden = 0;
+++
+++                // -- Start bypass block
+++
+++                bypass_start(s);
+++
+++                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+++
+++                if (coded_vals != 0)
+++                {
+++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+++                    int * level = levels - 1;
+++
+++                    do {
+++                        {
+++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
+++                            level += z;
+++                            coded_vals <<= z;
+++                        }
++ 
++-            if (first_greater1_coeff_idx != -1) {
++-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
++-            }
++-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
++-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
++-            } else {
++-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
++-            }
+++                        {
+++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+++
+++                            sum_abs += last_coeff_abs_level_remaining + 1;
+++                            *level = trans_coeff_level;
+++
+++                            if (stat_coeff != NULL)
+++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                            stat_coeff = NULL;
++ 
++-            for (m = 0; m < n_end; m++) {
++-                n = significant_coeff_flag_idx[m];
++-                GET_COORD(offset, n);
++-                if (m < 8) {
++-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
++-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
++-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++-
++-                        trans_coeff_level += last_coeff_abs_level_remaining;
++-                        if (trans_coeff_level > (3 << c_rice_param))
++-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
++-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
++-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
++-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
++-                                lc->stat_coeff[sb_type]++;
++-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
++-                                if (lc->stat_coeff[sb_type] > 0)
++-                                    lc->stat_coeff[sb_type]--;
++-                            rice_init = 1;
+++                            if (trans_coeff_level > (3 << c_rice_param) &&
+++                                (c_rice_param < 4 || rice_adaptation_enabled))
+++                                ++c_rice_param;
++                         }
++-                    }
++-                } else {
++-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++-
++-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
++-                    if (trans_coeff_level > (3 << c_rice_param))
++-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
++-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
++-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
++-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
++-                            lc->stat_coeff[sb_type]++;
++-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
++-                            if (lc->stat_coeff[sb_type] > 0)
++-                                lc->stat_coeff[sb_type]--;
++-                        rice_init = 1;
++-                    }
+++                    } while (coded_vals != 0);
++                 }
++-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
++-                    sum_abs += trans_coeff_level;
++-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
++-                        trans_coeff_level = -trans_coeff_level;
+++
+++                // sign_hidden = 0 or 1 so we can combine the tests
+++                if ((sign_hidden & sum_abs) != 0) {
+++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
++                 }
++-                if (coeff_sign_flag >> 15)
++-                    trans_coeff_level = -trans_coeff_level;
++-                coeff_sign_flag <<= 1;
++-                if(!lc->cu.cu_transquant_bypass_flag) {
++-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
++-                        if(y_c || x_c || log2_trafo_size < 4) {
++-                            switch(log2_trafo_size) {
++-                                case 3: pos = (y_c << 3) + x_c; break;
++-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
++-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
++-                                default: pos = (y_c << 2) + x_c; break;
++-                            }
++-                            scale_m = scale_matrix[pos];
++-                        } else {
++-                            scale_m = dc_scale;
++-                        }
+++
+++                bypass_finish(s);
+++
+++                // -- Finish bypass block
+++
+++                // Scale loop
+++                {
+++                    int m = nb_significant_coeff_flag - 1;
+++
+++                    // Deal with DC component (if any) first
+++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
+++                    {
+++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+++                        blk_coeffs[0] = trans_scale_sat(
+++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
+++                        --m;
++                     }
++-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
++-                    if(trans_coeff_level < 0) {
++-                        if((~trans_coeff_level) & 0xFffffffffff8000)
++-                            trans_coeff_level = -32768;
++-                    } else {
++-                        if(trans_coeff_level & 0xffffffffffff8000)
++-                            trans_coeff_level = 32767;
+++
+++#if !USE_N_END_1
+++                    // If N_END_1 set then m was at least 1 initially
+++                    if (m >= 0)
+++#endif
+++                    {
+++                        do {
+++                            const xy_off_t * const xy_off = scan_xy_off +
+++                                significant_coeff_flag_idx[m];
+++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+++
+++                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
+++                                (levels[m] ^ k) - k,
+++                                scale,
+++                                blk_scale[xy_off->scale],
+++                                shift);
+++                        } while (--m >= 0);
++                     }
++                 }
++-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+ +
+-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-+*/
++             }
++         }
++-    }
+++    } while ((i = next_subset(s, i, c_idx_nz,
+++        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
++ 
++     if (lc->cu.cu_transquant_bypass_flag) {
++         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++         }
++     } else {
++-        if (transform_skip_flag) {
+++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
++             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
++                       log2_trafo_size == 2 &&
++                       lc->cu.pred_mode == MODE_INTRA;
++@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                 for (i = 0; i < 8; i++)
++                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
++             }
++-
++             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
++ 
++             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++             }
++         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
++-            s->hevcdsp.idct_4x4_luma(coeffs);
+++           s->hevcdsp.idct_4x4_luma(coeffs);
++         } else {
+++#ifdef RPI
+++            if (!use_vpu) {
+++              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+++              if (max_xy == 0) {
+++                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+++              } else {
+++                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+++                  if (max_xy < 4)
+++                      col_limit = FFMIN(4, col_limit);
+++                  else if (max_xy < 8)
+++                      col_limit = FFMIN(8, col_limit);
+++                  else if (max_xy < 12)
+++                      col_limit = FFMIN(24, col_limit);
+ +
+-+#ifndef __USER_VCSM__H__INCLUDED__
+-+#define __USER_VCSM__H__INCLUDED__
+++                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+++              }
+++            }
+++#else
++             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++             if (max_xy == 0)
++                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
++@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                     col_limit = FFMIN(24, col_limit);
++                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
++             }
+++#endif
++         }
++     }
++     if (lc->tu.cross_pf) {
++@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++         }
++     }
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++        cmd->type = RPI_PRED_TRANSFORM_ADD;
+++        cmd->size = log2_trafo_size;
+++        cmd->buf = coeffs;
+++        cmd->dst = dst;
+++        cmd->stride = stride;
+++        return;
+++    }
+++#endif
++     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
++ }
++ 
++diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
++index 1f33b0c..55a0315 100644
++--- a/libavcodec/hevc_filter.c
+++++ b/libavcodec/hevc_filter.c
++@@ -22,6 +22,12 @@
++  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
+++//#define DISABLE_SAO
+++//#define DISABLE_DEBLOCK
+++//#define DISABLE_STRENGTHS
+++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+++//#define DISABLE_DEBLOCK_NONREF
+ +
+-+/* VideoCore Shared Memory - user interface library.
+-+**
+-+** This library provides all the necessary abstraction for any application to
+-+** make use of the shared memory service which is distributed accross a kernel
+-+** driver and a videocore service.
+-+**
+-+** It is an application design decision to choose or not to use this service.
+-+**
+-+** The logical flow of operations that a user application needs to follow when
+-+** using this service is:
+-+**
+-+**       1) Initialize the service.
+-+**       2) Allocate shared memory blocks.
+-+**       3) Start using the allocated blocks.
+-+**          - In order to gain ownership on a block, lock the allocated block,
+-+**            locking a block returns a valid address that the user application
+-+**            can access.
+-+**          - When finished with using the block for the current execution cycle
+-+**            or function, and so when giving up the ownership, unlock the block.
+-+**       4) A block can be locked/unlocked as many times required - within or outside
+-+**          of - a specific execution context.
+-+**       5) To completely release an allocated block, free it.
+-+**       6) If the service is no longer required, terminate it.
+-+**
+-+**
+-+** Some generic considerations:
++ #include "libavutil/common.h"
++ #include "libavutil/internal.h"
++ 
++@@ -31,6 +37,11 @@
++ 
++ #include "bit_depth_template.c"
++ 
+++#ifdef RPI
+++#include "rpi_user_vcsm.h"
+++#include "rpi_qpu.h"
+++#endif
+ +
+-+** Allocating memory blocks.
+-+**
+-+**   Memory blocks can be allocated in different manners depending on the cache
+-+**   behavior desired.  A given block can either be:
++ #define LUMA 0
++ #define CB 1
++ #define CR 2
++@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
++     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
++ 
+++#ifdef DISABLE_SAO
+++    return;
+++#endif
+ +
+-+**       - Allocated in a non cached fashion all the way through host and videocore.
+-+**       - Allocated in a cached fashion on host OR videocore.
+-+**       - Allocated in a cached fashion on host AND videocore.
+-+**
+-+**   It is an application decision to determine how to allocate a block.  Evidently
+-+**   if the application will be doing substantial read/write accesses to a given block,
+-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
+-+**   better results.
+-+**
+-+**
+-+** Locking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, locking the
+-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**   It is possible to dynamically change the host cache behavior (ie cached or non
+-+**   cached) of a given allocation without needing to free and re-allocate the block.
+-+**   This feature can be useful for such application which requires access to the block
+-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+-+**   the application can optimize performances for a given duration of use.
+-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+-+**   cache.  If one requires to change the videocore cache behavior, then a new block
+-+**   must be created to replace the old one.
+-+**
+-+**   On successful locking, a valid pointer is returned that the application can use
+-+**   to access to data inside the block.  There is no guarantee that the pointer will
+-+**   stay valid following the unlock action corresponding to this lock.
+-+**
+-+**
+-+** Unocking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, unlocking the
+-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
+-+**   explicitely asked not to flush the cache for performances reasons.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**
+-+** A complete API is defined below.
+-+*/
++     if (restore) {
++         if (!edges[0]) {
++             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
++@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                 s->ps.sps->pcm.loop_filter_disable_flag) ||
++                s->ps.pps->transquant_bypass_enable_flag;
++ 
+++#ifdef DISABLE_DEBLOCK_NONREF
+++    if (!s->used_for_ref)
+++      return; // Don't deblock non-reference frames
+++#endif
+++#ifdef DISABLE_DEBLOCK
+++    return;
+++#endif
+++    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+++        return;
++     if (x0) {
++         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
++         left_beta_offset = s->deblock[ctb - 1].beta_offset;
++@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                          s->frame->linesize[LUMA],
++                                                          beta, tc, no_p, no_q);
++                 } else
+++#ifdef RPI_DEBLOCK_VPU
+++                if (s->enable_rpi_deblock) {
+++                    uint8_t (*setup)[2][2][4];
+++                    int num16 = (y>>4)*s->setup_width + (x>>4);
+++                    int a = ((y>>3) & 1) << 1;
+++                    int b = (x>>3) & 1;
+++                    setup = s->dvq->y_setup_arm[num16];
+++                    setup[0][b][0][a] = beta;
+++                    setup[0][b][0][a + 1] = beta;
+++                    setup[0][b][1][a] = tc[0];
+++                    setup[0][b][1][a + 1] = tc[1];
+++                } else
+++#endif
++                     s->hevcdsp.hevc_v_loop_filter_luma(src,
++                                                        s->frame->linesize[LUMA],
++                                                        beta, tc, no_p, no_q);
++@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                          s->frame->linesize[LUMA],
++                                                          beta, tc, no_p, no_q);
++                 } else
+++#ifdef RPI_DEBLOCK_VPU
+++                if (s->enable_rpi_deblock) {
+++                    uint8_t (*setup)[2][2][4];
+++                    int num16 = (y>>4)*s->setup_width + (x>>4);
+++                    int a = ((x>>3) & 1) << 1;
+++                    int b = (y>>3) & 1;
+++                    setup = s->dvq->y_setup_arm[num16];
+++                    setup[1][b][0][a] = beta;
+++                    setup[1][b][0][a + 1] = beta;
+++                    setup[1][b][1][a] = tc[0];
+++                    setup[1][b][1][a + 1] = tc[1];
+++                } else
+++#endif
++                     s->hevcdsp.hevc_h_loop_filter_luma(src,
++                                                        s->frame->linesize[LUMA],
++                                                        beta, tc, no_p, no_q);
++@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                                    s->frame->linesize[chroma],
++                                                                    c_tc, no_p, no_q);
++                         } else
+++#ifdef RPI_DEBLOCK_VPU
+++                        if (s->enable_rpi_deblock) {
+++                            uint8_t (*setup)[2][2][4];
+++                            int xc = x>>s->ps.sps->hshift[chroma];
+++                            int yc = y>>s->ps.sps->vshift[chroma];
+++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                            int a = ((yc>>3) & 1) << 1;
+++                            int b = (xc>>3) & 1;
+++                            setup = s->dvq->uv_setup_arm[num16];
+++                            setup[0][b][0][a] = c_tc[0];
+++                            setup[0][b][0][a + 1] = c_tc[1];
+++                        } else
+++#endif
++                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
++                                                                  s->frame->linesize[chroma],
++                                                                  c_tc, no_p, no_q);
+ +
+-+#ifdef __cplusplus
+-+extern "C"
+-+{
++                     }
++                 }
++ 
++@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                                    s->frame->linesize[chroma],
++                                                                    c_tc, no_p, no_q);
++                         } else
+++#ifdef RPI_DEBLOCK_VPU
+++                        if (s->enable_rpi_deblock) {
+++                            uint8_t (*setup)[2][2][4];
+++                            int xc = x>>s->ps.sps->hshift[chroma];
+++                            int yc = y>>s->ps.sps->vshift[chroma];
+++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                            int a = ((xc>>3) & 1) << 1;
+++                            int b = (yc>>3) & 1;
+++                            setup = s->dvq->uv_setup_arm[num16];
+++                            setup[1][b][0][a] = c_tc[0];
+++                            setup[1][b][0][a + 1] = c_tc[1];
+++                        } else
+ +#endif
++                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
++                                                                  s->frame->linesize[chroma],
++                                                                  c_tc, no_p, no_q);
++@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     }
++ }
++ 
++-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
++-                             RefPicList *neigh_refPicList)
++-{
++-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
++-        // same L0 and L1
++-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
++-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
++-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
++-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else {
++-            return 1;
++-        }
++-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++-        Mv A, B;
++-        int ref_A, ref_B;
++-
++-        if (curr->pred_flag & 1) {
++-            A     = curr->mv[0];
++-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
++-        } else {
++-            A     = curr->mv[1];
++-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
++-        }
++-
++-        if (neigh->pred_flag & 1) {
++-            B     = neigh->mv[0];
++-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
++-        } else {
++-            B     = neigh->mv[1];
++-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
++-        }
++-
++-        if (ref_A == ref_B) {
++-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else
++-            return 1;
++-    }
++-
++-    return 1;
++-}
++ 
++ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++                                            int log2_trafo_size)
++@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
++     int min_pu_width     = s->ps.sps->min_pu_width;
++     int min_tu_width     = s->ps.sps->min_tb_width;
++-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
++-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
++     int boundary_upper, boundary_left;
++-    int i, j, bs;
+++    int i, j;
+++    RefPicList *rpl      = s->ref->refPicList;
+++    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+++    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+++    int y_pu             = y0 >> log2_min_pu_size;
+++    int x_pu             = x0 >> log2_min_pu_size;
+++    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    int is_intra         = curr->pred_flag == PF_INTRA;
+++    int inc              = log2_min_pu_size == 2 ? 2 : 1;
+++    uint8_t *bs;
+ +
+-+/* Different status that can be dumped.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+-+                                    // Result of the walk is seen in the videocore
+-+                                    // log.
+-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+-+                                    // driver (ie for all processes).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+-+                                    // VCSM_STATUS_HOST_WALK_MAP.
+-+                                    //
+-+   VCSM_STATUS_NONE,                // Must be last - invalid.
+++#ifdef DISABLE_STRENGTHS
+++    return;
+++#endif
++ 
++     boundary_upper = y0 > 0 && !(y0 & 7);
++     if (boundary_upper &&
++@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_upper = 0;
++ 
+++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+ +
+-+} VCSM_STATUS_T;
++     if (boundary_upper) {
++         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
++                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
++-                              s->ref->refPicList;
++-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
++-        int yq_pu =  y0      >> log2_min_pu_size;
++-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
++-        int yq_tu =  y0      >> log2_min_tu_size;
+++                              rpl;
+++        MvField *top = curr - min_pu_width;
+ +
+-+/* Different kind of cache behavior.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+++        if (is_intra) {
+++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+++                bs[i >> 2] = 2;
+ +
+-+} VCSM_CACHE_TYPE_T;
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+ +
+-+/* Initialize the vcsm processing.
+-+**
+-+** Must be called once before attempting to do anything else.
+-+**
+-+** Returns 0 on success, -1 on error.
+-+*/
+-+int vcsm_init( void );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+++                    curr, top, bs);
++ 
++             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int x_pu = (x0 + i) >> log2_min_pu_size;
++-                int x_tu = (x0 + i) >> log2_min_tu_size;
++-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
++-                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
++-
++-                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
++-                    bs = 2;
++-                else if (curr_cbf_luma || top_cbf_luma)
++-                    bs = 1;
++-                else
++-                    bs = boundary_strength(s, curr, top, rpl_top);
++-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+++                int i_pu = i >> log2_min_pu_size;
+++                int i_tu = i >> log2_min_tu_size;
+ +
+++                if (top[i_pu].pred_flag == PF_INTRA)
+++                    bs[i >> 2] = 2;
+++                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+++                    bs[i >> 2] = 1;
++             }
+++        }
+++    }
+ +
+-+/* Terminates the vcsm processing.
+-+**
+-+** Must be called vcsm services are no longer needed, it will
+-+** take care of removing any allocation under the current process
+-+** control if deemed necessary.
+-+*/
+-+void vcsm_exit( void );
+++    if (!is_intra) {
+++        for (j = inc; j < trafo_in_min_pus; j += inc) {
+++            MvField *top;
+ +
+++            curr += min_pu_width * inc;
+++            top = curr - min_pu_width;
+++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+ +
+-+/* Queries the status of the the vcsm.
+-+**
+-+** Triggers dump of various kind of information, see the
+-+** different variants specified in VCSM_STATUS_T.
+-+**
+-+** Pid is optional.
+-+*/
+-+void vcsm_status( VCSM_STATUS_T status, int pid );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                    curr, top, bs);
+++        }
++     }
++ 
++-    // bs for vertical TU boundaries
++     boundary_left = x0 > 0 && !(x0 & 7);
++     if (boundary_left &&
++         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
++@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_left = 0;
++ 
+++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+ +
++     if (boundary_left) {
++         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
++                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
++-                               s->ref->refPicList;
++-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
++-        int xq_pu =  x0      >> log2_min_pu_size;
++-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
++-        int xq_tu =  x0      >> log2_min_tu_size;
+++                               rpl;
+++        MvField *left = curr - 1;
++ 
++-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int y_pu      = (y0 + i) >> log2_min_pu_size;
++-                int y_tu      = (y0 + i) >> log2_min_tu_size;
++-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
++-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
++-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
++-
++-                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
++-                    bs = 2;
++-                else if (curr_cbf_luma || left_cbf_luma)
++-                    bs = 1;
++-                else
++-                    bs = boundary_strength(s, curr, left, rpl_left);
++-                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
++-            }
++-    }
+++        if (is_intra) {
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+++                bs[j * s->bs_width >> 2] = 2;
++ 
++-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
++-        RefPicList *rpl = s->ref->refPicList;
++-
++-        // bs for TU internal horizontal PU boundaries
++-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
++-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
++-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
++-
++-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int x_pu = (x0 + i) >> log2_min_pu_size;
++-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-
++-                bs = boundary_strength(s, curr, top, rpl);
++-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+ +
+-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+-+** allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc( unsigned int size, char *name );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+++                    curr, left, bs);
+ +
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+++                int j_pu = j >> log2_min_pu_size;
+++                int j_tu = j >> log2_min_tu_size;
+ +
+-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+-+** allocator, the type of caching requested is passed as argument of the
+-+** function call.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+++                    bs[j * s->bs_width >> 2] = 2;
+++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+++                    bs[j * s->bs_width >> 2] = 1;
++             }
++         }
+++    }
++ 
++-        // bs for TU internal vertical PU boundaries
++-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
++-            int y_pu = (y0 + j) >> log2_min_pu_size;
+++    if (!is_intra) {
+++        for (i = inc; i < trafo_in_min_pus; i += inc) {
+++            MvField *left;
++ 
++-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
++-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
++-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
++-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+++            curr += inc;
+++            left = curr - 1;
+++            bs += inc << log2_min_pu_size >> 2;
++ 
++-                bs = boundary_strength(s, curr, left, rpl);
++-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
++-            }
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                    curr, left, bs);
++         }
++     }
++ }
++@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++ #undef CB
++ #undef CR
++ 
+++#if !defined(RPI_FAST_CACHEFLUSH)
+++#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+++static void flush_buffer_y(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+++    gpu_cache_flush(&p);
+++}
+ +
+++static void flush_buffer_u(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+++    gpu_cache_flush(&p);
+++}
+ +
+-+/* Shares an allocated block of memory via the vcsm memory allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_share( unsigned int handle );
+-+
+-+
+-+/* Resizes a block of memory allocated previously by vcsm_alloc.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** The handle must be unlocked by user prior to attempting any
+-+** resize action.
+-+**
+-+** On error, the original size allocated against the handle
+-+** remains available the same way it would be following a
+-+** successful vcsm_malloc.
+-+*/
+-+int vcsm_resize( unsigned int handle, unsigned int new_size );
+-+
+-+
+-+/* Frees a block of memory that was successfully allocated by
+-+** a prior call the vcms_alloc.
+-+**
+-+** The handle should be considered invalid upon return from this
+-+** call.
+-+**
+-+** Whether any memory is actually freed up or not as the result of
+-+** this call will depends on many factors, if all goes well it will
+-+** be freed.  If something goes wrong, the memory will likely end up
+-+** being freed up as part of the vcsm_exit process.  In the end the
+-+** memory is guaranteed to be freed one way or another.
+-+*/
+-+void vcsm_free( unsigned int handle );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a mapped user address
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+++static void flush_buffer_v(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+++    gpu_cache_flush(&p);
+++}
+++#endif
+++#endif
+ +
+ +
+-+/* Retrieves a videocore opaque handle from a opaque handle
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+++#ifdef RPI_DEBLOCK_VPU
+++#error Not fixed yet
+ +
+++// ff_hevc_flush_buffer_lines
+++// flushes and invalidates all pixel rows in [start,end-1]
+++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++        struct vcsm_user_clean_invalid_s iocache = {};
+++        int curr_y = start;
+++        int n = end;
+++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+++        int n_uv = n >> s->ps.sps->vshift[1];
+++        int sz,base;
+++        GPU_MEM_PTR_T p;
+++        if (curr_uv < 0) curr_uv = 0;
+++        if (n_uv<=curr_uv) { return; }
+++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++        base = s->frame->linesize[1] * curr_uv;
+++        if (flush_chroma) {
+++          p = get_gpu_mem_ptr_u(s->frame);
+++          iocache.s[0].handle = p.vcsm_handle;
+++          iocache.s[0].cmd = 3; // clean+invalidate
+++          iocache.s[0].addr = (int)p.arm + base;
+++          iocache.s[0].size  = sz;
+++          p = get_gpu_mem_ptr_v(s->frame);
+++          iocache.s[1].handle = p.vcsm_handle;
+++          iocache.s[1].cmd = 3; // clean+invalidate
+++          iocache.s[1].addr = (int)p.arm + base;
+++          iocache.s[1].size  = sz;
+++        }
+++        if (flush_luma) {
+++          p = get_gpu_mem_ptr_y(s->frame);
+++          sz = s->frame->linesize[0] * (n-curr_y);
+++          base = s->frame->linesize[0] * curr_y;
+++          iocache.s[2].handle = p.vcsm_handle;
+++          iocache.s[2].cmd = 3; // clean+invalidate
+++          iocache.s[2].addr = (int)p.arm + base;
+++          iocache.s[2].size  = sz;
+++        }
+++        vcsm_clean_invalid( &iocache );
+++#else
+++        if (flush_chroma) {
+++          flush_buffer_u(s->frame);
+++          flush_buffer_v(s->frame);
+++        }
+++        if (flush_luma) {
+++          flush_buffer_y(s->frame);
+++        }
+++#endif
+++}
+++#endif
+ +
+-+/* Retrieves a user opaque handle from a mapped user address
+-+** pointer.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+*/
+-+unsigned int vcsm_usr_handle( void *usr_ptr );
+++#ifdef RPI_INTER_QPU
+++void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+++{
+++    if (s->enable_rpi && s->used_for_ref) {
+++      // TODO make this use ff_hevc_flush_buffer_lines
+++#ifdef RPI_FAST_CACHEFLUSH
+++        struct vcsm_user_clean_invalid_s iocache = {};
+++        int curr_y = ((int *)f->progress->data)[0];
+++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+++        int n_uv = n >> s->ps.sps->vshift[1];
+++        int sz,base;
+++        GPU_MEM_PTR_T p;
+++        if (curr_uv < 0) curr_uv = 0;
+++        if (n_uv<=curr_uv) { return; }
+++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++        base = s->frame->linesize[1] * curr_uv;
+++        p = get_gpu_mem_ptr_u(s->frame);
+++        iocache.s[0].handle = p.vcsm_handle;
+++        iocache.s[0].cmd = 3; // clean+invalidate
+++        iocache.s[0].addr = (int)p.arm + base;
+++        iocache.s[0].size  = sz;
+++        p = get_gpu_mem_ptr_v(s->frame);
+++        iocache.s[1].handle = p.vcsm_handle;
+++        iocache.s[1].cmd = 3; // clean+invalidate
+++        iocache.s[1].addr = (int)p.arm + base;
+++        iocache.s[1].size  = sz;
+ +
+++#ifdef RPI_LUMA_QPU
+++        p = get_gpu_mem_ptr_y(s->frame);
+++        sz = s->frame->linesize[0] * (n-curr_y);
+++        base = s->frame->linesize[0] * curr_y;
+++        iocache.s[2].handle = p.vcsm_handle;
+++        iocache.s[2].cmd = 3; // clean+invalidate
+++        iocache.s[2].addr = (int)p.arm + base;
+++        iocache.s[2].size  = sz;
+++#endif
+++        vcsm_clean_invalid( &iocache );
+++#else
+++        flush_buffer_u(s->frame);
+++        flush_buffer_v(s->frame);
+++#ifdef RPI_LUMA_QPU
+++        flush_buffer_y(s->frame);
+++#endif
+ +
+-+/* Retrieves a mapped user address from an opaque user
+-+** handle.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero address on success.
+-+**
+-+** On success, the address corresponds to the pointer
+-+** which can access the data allocated via the vcsm_malloc
+-+** call.
+-+*/
+-+void *vcsm_usr_address( unsigned int handle );
+++#endif
+++        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+++        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+++        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+++    }
+++}
+++#endif
+ +
+++#ifdef RPI_DEBLOCK_VPU
+++#error XXX
+++/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+++static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+++{
+++  // Flush image, 4 lines above to bottom of ctb stripe
+++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+++  // TODO flush buffer of beta/tc setup when it becomes cached
+ +
+-+/* Locks the memory associated with this opaque handle.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock( unsigned int handle );
+++  // Prepare three commands at once to avoid calling overhead
+++  s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+++  s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+++  s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+++  s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+++  s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+++  s->dvq->vpu_cmds_arm[0][5] = 2;
+++
+++  s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+++  s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+++  s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+++  s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++  s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++  s->dvq->vpu_cmds_arm[1][5] = 3;
+++
+++  s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+++  s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+++  s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+++  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++  s->dvq->vpu_cmds_arm[2][5] = 4;
+++  // Call VPU
+++  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+ +
+++  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+++  s->dvq = s->dvq_ents + s->dvq_n;
+ +
+-+/* Locks the memory associated with this opaque handle.  The lock
+-+** also gives a chance to update the *host* cache behavior of the
+-+** allocated buffer if so desired.  The *videocore* cache behavior
+-+** of the allocated buffer cannot be changed by this call and such
+-+** attempt will be ignored.
+-+**
+-+** The system will attempt to honour the cache_update mode request,
+-+** the cache_result mode will provide the final answer on which cache
+-+** mode is really in use.  Failing to change the cache mode will not
+-+** result in a failure to lock the buffer as it is an application
+-+** decision to choose what to do if (cache_result != cache_update)
+-+**
+-+** The value returned in cache_result can only be considered valid if
+-+** the returned pointer is non NULL.  The cache_result pointer may be
+-+** NULL if the application does not care about the actual outcome of
+-+** its action with regards to the cache behavior change.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock_cache( unsigned int handle,
+-+                       VCSM_CACHE_TYPE_T cache_update,
+-+                       VCSM_CACHE_TYPE_T *cache_result );
+++  if (s->dvq->cmd_id != -1) {
+++      vpu_wait(s->dvq->cmd_id);
+++      s->dvq->cmd_id = -1;
+++  }
+++}
+ +
+++#endif
+ +
+-+/* Unlocks the memory associated with this user mapped address.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr( void *usr_ptr );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl( unsigned int handle );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+-+
+-+#ifdef __cplusplus
+-+}
++ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++ {
++     int x_end = x >= s->ps.sps->width  - ctb_size;
+++#ifdef RPI_DEBLOCK_VPU
+++    int done_deblock = 0;
+ +#endif
+-+
+-+#endif /* __USER_VCSM__H__INCLUDED__ */
+--- 
+-2.7.4
+-
+-
+-From 6cfa5910be47865aaaf58c185587189c332765a6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@argondesign.com>
+-Date: Sat, 2 May 2015 21:15:37 +0100
+-Subject: [PATCH 04/68] First working version with uncached memory
+-
+----
+- libavcodec/hevc.c               |  61 +++++-
+- libavcodec/hevc.h               |  12 +-
+- libavcodec/hevc_cabac.c         |  39 +++-
+- libavcodec/hevc_filter.c        |  16 ++
+- libavcodec/hevcpred_template.c  |   6 +
+- libavcodec/rpi_hevc_transform.h | 422 +++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s | 153 +++++++++++++--
+- libavcodec/rpi_qpu.c            |  72 +++++++
+- libavcodec/rpi_qpu.h            |   1 +
+- 9 files changed, 736 insertions(+), 46 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ab55df1..94ff709 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -45,6 +45,8 @@
+- #include "rpi_qpu.h"
+- #endif
+- 
+-+// #define DISABLE_MC
+-+
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- /**
+-@@ -1079,11 +1081,15 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+-+                        printf("Cross component not supported\n"); // TODO
+-+                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+- 
+-             if (lc->tu.cross_pf) {
+-+                printf("Cross component not supported\n"); // TODO
+-+                exit(-1);
+-                 hls_cross_component_pred(s, 1);
+-             }
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-@@ -1112,6 +1118,8 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+-+                        printf("Cross component not supported\n"); // TODO
+-+                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+-@@ -1409,6 +1417,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-     int idx              = ff_hevc_pel_weight[block_w];
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
++         deblocking_filter_CTB(s, x, y);
+++#ifdef RPI_DEBLOCK_VPU
+++    if (s->enable_rpi_deblock && x_end)
+++    {
+++      int y_at_end = y >= s->ps.sps->height - ctb_size;
+++      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
+++      int y_start = y&~63;
+++      if (y_at_end) height = s->ps.sps->height - y_start;
+++      if ((((y+ctb_size)&63)==0) || y_at_end) {
+++        done_deblock = 1;
+++        rpi_deblock(s, y_start, height);
+++      }
+++    }
+ +#endif
+-+
+-     x_off += mv->x >> 2;
+-     y_off += mv->y >> 2;
+-     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1479,6 +1491,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+-     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++     if (s->ps.sps->sao_enabled) {
++         int y_end = y >= s->ps.sps->height - ctb_size;
++         if (y && x)
++@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++             sao_filter_CTB(s, x - ctb_size, y);
++         if (y && x_end) {
++             sao_filter_CTB(s, x, y - ctb_size);
++-            if (s->threads_type & FF_THREAD_FRAME )
+++            if (s->threads_type & FF_THREAD_FRAME ) {
+++#ifdef RPI_INTER_QPU
+++                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+ +#endif
+-+
+-     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+-         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+-         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+-@@ -1564,6 +1580,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-     intptr_t _mx         = mx << (1 - hshift);
+-     intptr_t _my         = my << (1 - vshift);
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++                 ff_thread_report_progress(&s->ref->tf, y, 0);
+++            }
++         }
++         if (x_end && y_end) {
++             sao_filter_CTB(s, x , y);
++-            if (s->threads_type & FF_THREAD_FRAME )
+++            if (s->threads_type & FF_THREAD_FRAME ) {
+++#ifdef RPI_INTER_QPU
+++                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+ +#endif
+-+
+-     x_off += mv->x >> (2 + hshift);
+-     y_off += mv->y >> (2 + vshift);
+-     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1628,6 +1648,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+-     int hshift = s->ps.sps->hshift[1];
+-     int vshift = s->ps.sps->vshift[1];
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+++            }
+++        }
+++    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+++        //int newh = y + ctb_size - 4;
+++        //int currh = s->ref->tf.progress->data[0];
+++        //if (((y + ctb_size)&63)==0)
+++#ifdef RPI_DEBLOCK_VPU
+++        if (s->enable_rpi_deblock) {
+++          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+++          if (done_deblock) {
+++            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+++          }
+++        } else {
+++#ifdef RPI_INTER_QPU
+++          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+ +#endif
+-+
+-     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+-     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+-     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+-@@ -2367,6 +2391,22 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+++          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
++         }
++-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+++#else
+++#ifdef RPI_INTER_QPU
+++        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+++#endif
++         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+++#endif
+++    }
+  }
+  
+- #ifdef RPI
+-+static void rpi_execute_transform(HEVCContext *s)
++ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
++diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
++index 83f2ec2..6882a8d 100644
++--- a/libavcodec/hevc_ps.c
+++++ b/libavcodec/hevc_ps.c
++@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++     sps->amp_enabled_flag = get_bits1(gb);
++     sps->sao_enabled      = get_bits1(gb);
++ 
+++    av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
+++
++     sps->pcm_enabled_flag = get_bits1(gb);
++     if (sps->pcm_enabled_flag) {
++         sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
++diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
++index 9d773d9..a6534a9 100644
++--- a/libavcodec/hevcdsp.c
+++++ b/libavcodec/hevcdsp.c
++@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
++ #include "hevcdsp_template.c"
++ #undef BIT_DEPTH
++ 
+++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs)
+ +{
+-+    int i=2;
+-+    //int j;
+-+    //int16_t *coeffs = s->coeffs_buf_arm[i];
+-+    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-+    //    s->hevcdsp.idct[4-2](coeffs, 16);
+-+    //}
+++    for (; pus > 0; pus--) {
+++        int strength, out;
+++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+ +
+-+    //gpu_cache_flush(&s->coeffs_buf[i]);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+++#if 1 // This more directly matches the original implementation
+++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+++            // same L0 and L1
+++            if (curr_refL0 == neigh_refL0 &&
+++                curr_refL0 == curr_refL1 &&
+++                neigh_refL0 == neigh_refL1) {
+++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL0 == curr_refL0 &&
+++                       neigh_refL1 == curr_refL1) {
+++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL1 == curr_refL0 &&
+++                       neigh_refL0 == curr_refL1) {
+++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else {
+++                strength = 1;
+++            }
+++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+++            Mv curr_mv0, neigh_mv0;
+ +
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[i] = 0;
+-+}
+++            if (curr->pred_flag & 1) {
+++                curr_mv0   = curr->mv[0];
+++            } else {
+++                curr_mv0   = curr->mv[1];
+++                curr_refL0 = curr_refL1;
+++            }
+ +
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+-@@ -2387,7 +2427,6 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+--  s->num_coeffs = 0;
+- }
+- #endif
+- 
+-@@ -2434,7 +2473,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (x_ctb + ctb_size >= s->ps.sps->width) {
+-+        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
+-+            rpi_execute_transform(s);
+-             rpi_execute_pred_cmds(s);
+-         }
+- #endif
+-@@ -3179,7 +3219,9 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_mv_cmds);
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+--    av_freep(&s->coeffs_buf);
+-+    for(i = 0; i < 4; i++) {
+-+        gpu_free(&s->coeffs_buf[i]);
+-+    }
+- #endif
+- 
+-     for (i = 0; i < 3; i++) {
+-@@ -3246,13 +3288,16 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+--    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
+--    if (!s->coeffs_buf)
+--        goto fail;
+-+    for(i = 0; i < 4; i++) {
+-+        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-+        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+-+        if (!s->coeffs_buf_arm[i])
+-+            goto fail;
+-+    }
+-     s->enable_rpi = 0;
+- 
+-     // A little test program
+--    {
+-+    /*{
+-       GPU_MEM_PTR_T p;
+-       int err = gpu_malloc_cached(16, &p);
+-       short *q = (short *)p.arm;
+-@@ -3273,7 +3318,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-       printf(")\n");
+-       gpu_free(&p);
+-       goto fail; // Early out
+--    }
+-+    }*/
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7a1c35f..4167985 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -40,6 +40,11 @@
+- #include "thread.h"
+- #include "videodsp.h"
+- 
+-+// define RPI to split the CABAC/prediction/transform into separate stages
+-+#ifdef RPI
+-+#include "rpi_qpu.h"
+++            if (neigh->pred_flag & 1) {
+++                neigh_mv0   = neigh->mv[0];
+++            } else {
+++                neigh_mv0   = neigh->mv[1];
+++                neigh_refL0 = neigh_refL1;
+++            }
+++
+++            if (curr_refL0 == neigh_refL0) {
+++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else
+++                strength = 1;
+++        } else
+++            strength = 1;
+++#else // This has exactly the same effect, but is more suitable for vectorisation
+++        Mv curr_mv[2];
+++        Mv neigh_mv[2];
+++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+++
+++        if (!(curr->pred_flag & 2)) {
+++            curr_mv[1] = curr_mv[0];
+++            curr_refL1 = curr_refL0;
+++        }
+++        if (!(neigh->pred_flag & 2)) {
+++            neigh_mv[1] = neigh_mv[0];
+++            neigh_refL1 = neigh_refL0;
+++        }
+++        if (!(curr->pred_flag & 1)) {
+++            curr_mv[0] = curr_mv[1];
+++            curr_refL0 = curr_refL1;
+++        }
+++        if (!(neigh->pred_flag & 1)) {
+++            neigh_mv[0] = neigh_mv[1];
+++            neigh_refL0 = neigh_refL1;
+++        }
+++
+++        strength = 1;
+++
+++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+++                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+++                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+++
+++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+++                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+++                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+++
+++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+ +#endif
+ +
+- #define MAX_DPB_SIZE 16 // A.4.1
+- #define MAX_REFS 16
+- 
+-@@ -856,11 +861,12 @@ typedef struct HEVCContext {
+-     HEVCMvCmd *unif_mv_cmds;
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+--    int16_t *coeffs_buf;
+--    int num_mv_cmds;
+-+    GPU_MEM_PTR_T coeffs_buf[4];
+-+    int16_t *coeffs_buf_arm[4];
+-+    int num_coeffs[4];
+-     int num_xfm_cmds;
+-+    int num_mv_cmds;
+-     int num_pred_cmds;
+--    int num_coeffs;
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 4e97f06..d1cba86 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1031,6 +1031,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
+-     int explicit_rdpcm_flag = 0;
+-@@ -1044,6 +1045,18 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     uint8_t dc_scale;
+-     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+-                                          lc->tu.intra_pred_mode_c;
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int n = trafo_size * trafo_size;
+-+        if (use_vpu) {
+-+            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
+-+            s->num_coeffs[log2_trafo_size - 2] += n;
+-+        } else {
+-+            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
+-+            s->num_coeffs[0] += n;
+++        curr += in_inc / sizeof (MvField);
+++        neigh += in_inc / sizeof (MvField);
+++
+++        for (out = dup; out > 0; out--)
+++        {
+++            *bs = strength;
+++            bs += out_inc;
+ +        }
+ +    }
+-+#endif
+- 
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- 
+-@@ -1488,6 +1501,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+-             s->hevcdsp.idct_4x4_luma(coeffs);
+-         } else {
+-+#ifdef RPI
+-+            if (!use_vpu) {
+-+              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+-+              if (max_xy == 0)
+-+                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-+              else {
+-+                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+-+                  if (max_xy < 4)
+-+                      col_limit = FFMIN(4, col_limit);
+-+                  else if (max_xy < 8)
+-+                      col_limit = FFMIN(8, col_limit);
+-+                  else if (max_xy < 12)
+-+                      col_limit = FFMIN(24, col_limit);
+++}
+ +
+-+                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+-+              }
+-+            }
+-+#else
+-             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+-             if (max_xy == 0)
+-                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-@@ -1501,6 +1532,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                     col_limit = FFMIN(24, col_limit);
+-                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+-             }
+-+#endif
+-         }
+-     }
+-     if (lc->tu.cross_pf) {
+-@@ -1512,14 +1544,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++ {
++ #undef FUNC
++@@ -257,6 +371,8 @@ int i = 0;
++         break;
+      }
+- #ifdef RPI
+-     if (s->enable_rpi) {
+--        int16_t *c = s->coeffs_buf + s->num_coeffs;
+--        int n = trafo_size * trafo_size;
+-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+--        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
+--        s->num_coeffs += n;
+-+        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+--        cmd->buf = c;
+-+        cmd->buf = coeffs;
+-         cmd->dst = dst;
+-         cmd->stride = stride;
+-         return;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f33b0c..e4c3da7 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -22,6 +22,10 @@
+-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+-  */
+  
+-+//#define DISABLE_SAO
+-+//#define DISABLE_DEBLOCK
+-+//#define DISABLE_STRENGTHS
+++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+ +
+- #include "libavutil/common.h"
+- #include "libavutil/internal.h"
+- 
+-@@ -273,6 +277,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
+-     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+-     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
++     if (ARCH_X86)
++         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
++     if (ARCH_ARM)
++diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
++index 9f1f6dd..e221e54 100644
++--- a/libavcodec/hevcdsp.h
+++++ b/libavcodec/hevcdsp.h
++@@ -42,6 +42,17 @@ typedef struct SAOParams {
++     uint8_t type_idx[3];    ///< sao_type_idx
++ } SAOParams;
+  
+-+#ifdef DISABLE_SAO
+-+    return;
+-+#endif
+++typedef struct Mv {
+++    int16_t x;  ///< horizontal component of motion vector
+++    int16_t y;  ///< vertical component of motion vector
+++} Mv;
+ +
+-     if (restore) {
+-         if (!edges[0]) {
+-             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+-@@ -496,6 +504,10 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+-+#ifdef DISABLE_DEBLOCK
+-+    return;
+-+#endif
+++typedef struct MvField {
+++    DECLARE_ALIGNED(4, Mv, mv)[2];
+++    int8_t ref_idx[2];
+++    int8_t pred_flag;
+++} MvField;
+ +
+-     if (x0) {
+-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+-         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+-@@ -726,6 +738,10 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-     int boundary_upper, boundary_left;
+-     int i, j, bs;
++ typedef struct HEVCDSPContext {
++     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                     struct GetBitContext *gb, int pcm_bit_depth);
++@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
++     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                         int32_t *tc, uint8_t *no_p,
++                                         uint8_t *no_q);
+++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs);
++ } HEVCDSPContext;
+  
+-+#ifdef DISABLE_STRENGTHS
+-+    return;
+-+#endif
+-+
+-     boundary_upper = y0 > 0 && !(y0 & 7);
+-     if (boundary_upper &&
+-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
++ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+ diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 6ae87cc..71c6d52 100644
++index 6ae87cc..28d2653 100644
+ --- a/libavcodec/hevcpred_template.c
+ +++ b/libavcodec/hevcpred_template.c
+ @@ -20,6 +20,8 @@
+@@ -5149,7 +7113,20 @@ index 6ae87cc..71c6d52 100644
+  #include "libavutil/pixdesc.h"
+  
+  #include "bit_depth_template.c"
+-@@ -114,6 +116,10 @@ do {                                  \
++@@ -69,8 +71,11 @@ do {                                  \
++                 AV_WN4P(&ptr[i], a);                                           \
++             else                                                               \
++                 a = PIXEL_SPLAT_X4(ptr[i + 3])
++-
+++#ifdef RPI_WORKER
+++    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+++#else
++     HEVCLocalContext *lc = s->HEVClc;
+++#endif
++     int i;
++     int hshift = s->ps.sps->hshift[c_idx];
++     int vshift = s->ps.sps->vshift[c_idx];
++@@ -114,6 +119,10 @@ do {                                  \
+      int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                             (x0 + size_in_luma_h)) >> hshift;
+  
+@@ -5160,126 +7137,93 @@ index 6ae87cc..71c6d52 100644
+      if (s->ps.pps->constrained_intra_pred_flag == 1) {
+          int size_in_luma_pu_v = PU(size_in_luma_v);
+          int size_in_luma_pu_h = PU(size_in_luma_h);
++diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
++index 099a8c5..bdff2d2 100644
++--- a/libavcodec/mmaldec.c
+++++ b/libavcodec/mmaldec.c
++@@ -24,6 +24,9 @@
++  * MMAL Video Decoder
++  */
++ 
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
++ #include <bcm_host.h>
++ #include <interface/mmal/mmal.h>
++ #include <interface/mmal/mmal_parameters_video.h>
++@@ -31,6 +34,7 @@
++ #include <interface/mmal/util/mmal_util_params.h>
++ #include <interface/mmal/util/mmal_default_components.h>
++ #include <interface/mmal/vc/mmal_vc_api.h>
+++#pragma GCC diagnostic pop
++ 
++ #include "avcodec.h"
++ #include "internal.h"
++diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
++index 3adf28d..2f9195f 100644
++--- a/libavcodec/mpeg4videodec.c
+++++ b/libavcodec/mpeg4videodec.c
++@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
++ 
++         if (ctx->divx_version >= 0)
++             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+++
+++        if (ctx->num_sprite_warping_points > 1)
+++            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
++     }
++ 
++     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
++@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
++                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
++                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
++ 
+++    avctx->workaround_bugs = s->workaround_bugs;
++     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
++         s->codec_id == AV_CODEC_ID_MPEG4 &&
++         avctx->idct_algo == FF_IDCT_AUTO) {
+ diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 85a9102..c0c279f 100644
+---- a/libavcodec/rpi_hevc_transform.h
++new file mode 100644
++index 0000000..4309f1c
++--- /dev/null
+ +++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,11 +3,11 @@ unsigned char rpi_hevc_transform [] = {
+- 3,
+- 3,
+- 232,
+--128,
+-+32,
+- 0,
+- 0,
+- 0,
+--20,
+-+12,
+- 248,
+- 0,
+- 136,
+-@@ -56,9 +56,9 @@ unsigned char rpi_hevc_transform [] = {
+- 5,
+- 232,
+- 0,
+--0,
+- 8,
+- 0,
+-+0,
+- 128,
+- 69,
+- 113,
+-@@ -108,8 +108,8 @@ unsigned char rpi_hevc_transform [] = {
+- 128,
+- 2,
+- 0,
+--248,
+--62,
+-+8,
+-+2,
+- 0,
+- 128,
+- 144,
+-@@ -123,13 +123,13 @@ unsigned char rpi_hevc_transform [] = {
+- 3,
+- 32,
+- 8,
+--16,
+-+20,
+- 0,
+- 76,
+- 254,
+- 48,
+- 192,
+--9,
+-+4,
+- 4,
+- 32,
+- 8,
+-@@ -155,14 +155,46 @@ unsigned char rpi_hevc_transform [] = {
+- 192,
+- 41,
+- 3,
+--68,
+-+70,
+-+192,
+-+80,
+-+7,
+-+164,
+-+255,
+-+36,
+-+204,
+-+96,
+-+2,
++@@ -0,0 +1,3070 @@
+++unsigned char rpi_hevc_transform [] = {
+++21,
+++106,
+ +0,
+-+248,
+-+62,
+++144,
+++47,
+++1,
+++37,
+++106,
+ +0,
+-+3,
+-+255,
+-+55,
+-+208,
+-+120,
+-+3,
+-+224,
+-+3,
+-+190,
+-+11,
+-+16,
+-+139,
+-+246,
+-+91,
+++144,
+++66,
+++1,
+++53,
+++106,
+ +0,
+-+103,
+-+90,
+++144,
+++192,
+++4,
+++69,
+++106,
+ +0,
+-+70,
+- 192,
+- 80,
+- 7,
+- 164,
+- 255,
+- 36,
+--220,
+-+204,
+- 96,
+- 2,
+- 0,
+-@@ -182,7 +214,7 @@ unsigned char rpi_hevc_transform [] = {
+- 16,
+- 139,
+- 246,
+--83,
+-+91,
+- 0,
+- 103,
+- 90,
+-@@ -209,4 +241,374 @@ unsigned char rpi_hevc_transform [] = {
+- 96,
+- 90,
+- 0,
+++144,
+++192,
+++4,
+++85,
+++106,
+++0,
+++144,
+++220,
+++5,
+ +169,
+ +3,
+++62,
+++64,
+++79,
+++64,
+ +3,
+ +232,
+ +32,
+@@ -5312,9 +7256,11 @@ index 85a9102..c0c279f 100644
+ +248,
+ +0,
+ +0,
+++0,
+++96,
+ +3,
+ +232,
+-+128,
+++32,
+ +0,
+ +0,
+ +0,
+@@ -5324,6 +7270,22 @@ index 85a9102..c0c279f 100644
+ +2,
+ +0,
+ +0,
+++8,
+++232,
+++0,
+++4,
+++0,
+++0,
+++12,
+++248,
+++0,
+++128,
+++0,
+++0,
+++192,
+++8,
+++4,
+++0,
+ +4,
+ +232,
+ +64,
+@@ -5336,6 +7298,184 @@ index 85a9102..c0c279f 100644
+ +8,
+ +0,
+ +0,
+++128,
+++69,
+++113,
+++66,
+++12,
+++248,
+++0,
+++128,
+++0,
+++0,
+++192,
+++8,
+++4,
+++0,
+++128,
+++69,
+++113,
+++70,
+++128,
+++144,
+++40,
+++0,
+++4,
+++255,
+++48,
+++192,
+++128,
+++3,
+++32,
+++8,
+++16,
+++0,
+++76,
+++254,
+++48,
+++192,
+++9,
+++4,
+++32,
+++8,
+++0,
+++0,
+++4,
+++254,
+++0,
+++144,
+++128,
+++2,
+++0,
+++8,
+++2,
+++0,
+++128,
+++144,
+++23,
+++0,
+++4,
+++255,
+++48,
+++192,
+++128,
+++3,
+++32,
+++8,
+++20,
+++0,
+++76,
+++254,
+++48,
+++192,
+++4,
+++4,
+++32,
+++8,
+++0,
+++0,
+++140,
+++248,
+++44,
+++0,
+++0,
+++0,
+++32,
+++48,
+++4,
+++0,
+++128,
+++69,
+++113,
+++66,
+++242,
+++140,
+++211,
+++192,
+++34,
+++31,
+++41,
+++3,
+++70,
+++192,
+++80,
+++7,
+++164,
+++255,
+++36,
+++204,
+++96,
+++2,
+++0,
+++248,
+++62,
+++0,
+++3,
+++255,
+++55,
+++208,
+++120,
+++3,
+++224,
+++3,
+++190,
+++11,
+++16,
+++139,
+++246,
+++91,
+++0,
+++103,
+++90,
+++0,
+++70,
+++192,
+++80,
+++7,
+++164,
+++255,
+++36,
+++204,
+++224,
+++2,
+++0,
+++248,
+++62,
+++0,
+++3,
+++255,
+++55,
+++208,
+++120,
+++3,
+++224,
+++3,
+++190,
+++11,
+++16,
+++139,
+++246,
+++91,
+++0,
+++103,
+++90,
+++0,
+++225,
+++64,
+++242,
+++64,
+++3,
+++232,
+++128,
+++0,
+++0,
+++0,
+++7,
+++232,
+++0,
+++2,
+++0,
+++0,
+ +57,
+ +239,
+ +224,
+@@ -5354,18 +7494,26 @@ index 85a9102..c0c279f 100644
+ +64,
+ +26,
+ +64,
+++4,
+++232,
+++64,
+++0,
+++0,
+++0,
+++149,
+++96,
+ +161,
+ +64,
+ +152,
+ +64,
+ +128,
+ +144,
+-+31,
+++35,
+ +0,
+ +72,
+ +232,
+-+32,
+ +0,
+++4,
+ +0,
+ +0,
+ +65,
+@@ -5376,8 +7524,16 @@ index 85a9102..c0c279f 100644
+ +0,
+ +128,
+ +144,
+-+23,
+++27,
+++0,
+++4,
+++232,
+++0,
+++8,
+++0,
+ +0,
+++69,
+++96,
+ +145,
+ +64,
+ +168,
+@@ -5388,8 +7544,8 @@ index 85a9102..c0c279f 100644
+ +0,
+ +72,
+ +232,
+-+32,
+ +0,
+++4,
+ +0,
+ +0,
+ +65,
+@@ -5410,7 +7566,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +242,
+ +140,
+-+229,
+++221,
+ +192,
+ +57,
+ +239,
+@@ -5420,6 +7576,8 @@ index 85a9102..c0c279f 100644
+ +0,
+ +41,
+ +3,
+++239,
+++3,
+ +12,
+ +248,
+ +0,
+@@ -5427,7 +7585,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +0,
+ +192,
+-+8,
+++248,
+ +4,
+ +0,
+ +12,
+@@ -5437,14 +7595,14 @@ index 85a9102..c0c279f 100644
+ +64,
+ +0,
+ +192,
+-+8,
+++248,
+ +4,
+ +0,
+ +0,
+ +96,
+ +255,
+ +159,
+-+131,
+++154,
+ +255,
+ +0,
+ +232,
+@@ -5454,7 +7612,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +255,
+ +159,
+-+142,
+++165,
+ +255,
+ +4,
+ +255,
+@@ -5466,7 +7624,7 @@ index 85a9102..c0c279f 100644
+ +251,
+ +62,
+ +0,
+-+5,
+++4,
+ +255,
+ +51,
+ +204,
+@@ -5476,15 +7634,15 @@ index 85a9102..c0c279f 100644
+ +251,
+ +16,
+ +0,
+-+77,
+++76,
+ +254,
+ +51,
+ +204,
+-+9,
+-+4,
+++128,
+++3,
+ +224,
+ +251,
+-+0,
+++20,
+ +0,
+ +128,
+ +64,
+@@ -5504,16 +7662,6 @@ index 85a9102..c0c279f 100644
+ +99,
+ +0,
+ +0,
+-+4,
+-+254,
+-+0,
+-+144,
+-+128,
+-+2,
+-+0,
+-+8,
+-+2,
+-+0,
+ +32,
+ +247,
+ +240,
+@@ -5525,92 +7673,92 @@ index 85a9102..c0c279f 100644
+ +176,
+ +207,
+ +17,
+-+3,
+++19,
+ +32,
+ +247,
+ +112,
+ +207,
+ +18,
+-+3,
+++35,
+ +32,
+ +247,
+ +48,
+ +207,
+ +19,
+-+3,
+++51,
+ +32,
+ +247,
+ +240,
+ +206,
+ +20,
+-+3,
+++67,
+ +32,
+ +247,
+ +176,
+ +206,
+ +21,
+-+3,
+++83,
+ +32,
+ +247,
+ +112,
+ +206,
+ +22,
+-+3,
+++99,
+ +32,
+ +247,
+ +48,
+ +206,
+ +23,
+-+3,
+++115,
+ +32,
+ +247,
+ +240,
+ +205,
+ +24,
+-+3,
+++131,
+ +32,
+ +247,
+ +176,
+ +205,
+ +25,
+-+3,
+++147,
+ +32,
+ +247,
+ +112,
+ +205,
+ +26,
+-+3,
+++163,
+ +32,
+ +247,
+ +48,
+ +205,
+ +27,
+-+3,
+++179,
+ +32,
+ +247,
+ +240,
+ +204,
+ +28,
+-+3,
+++195,
+ +32,
+ +247,
+ +176,
+ +204,
+ +29,
+-+3,
+++211,
+ +32,
+ +247,
+ +112,
+ +204,
+ +30,
+-+3,
+++227,
+ +32,
+ +247,
+ +48,
+ +204,
+ +31,
+-+3,
+-+5,
+++243,
+++4,
+ +255,
+ +51,
+ +204,
+@@ -5620,20 +7768,20 @@ index 85a9102..c0c279f 100644
+ +251,
+ +16,
+ +0,
+-+77,
+++76,
+ +254,
+ +51,
+ +204,
+-+9,
+-+4,
+++128,
+++3,
+ +224,
+ +251,
+-+0,
+++20,
+ +0,
+ +0,
+ +237,
+++32,
+ +0,
+-+4,
+ +0,
+ +0,
+ +140,
+@@ -5646,29347 +7794,1846 @@ index 85a9102..c0c279f 100644
+ +99,
+ +0,
+ +0,
+++111,
+++3,
+++4,
+++254,
+++0,
+++128,
+++0,
+++4,
+++0,
+++248,
+++0,
+++0,
+++2,
+++232,
+++32,
+++0,
+++0,
+++0,
+++140,
+++248,
+++32,
+++0,
+++0,
+++0,
+++224,
+++35,
+++0,
+++0,
+++64,
+++232,
+++0,
+++2,
+++0,
+++0,
+++193,
+++232,
+++0,
+++1,
+++0,
+++0,
+++1,
+++106,
+++116,
+++30,
+ +90,
+ +0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index 5e2728d..1e389c7 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -58,13 +58,6 @@
+- #
+- #
+- 
+--test_add:
+--  vldh HX(0,0),(r0)
+--  vadd HX(0,0),HX(0,0),10
+--  vsth HX(0,0),(r0)
+--  mov r0,7 # return value
+--  b lr
+--
+- # Columns are transformed first
+- #
+- # Store top left half of transMatrix2 in
+-@@ -79,7 +72,7 @@ test_add:
+- #
+- 
+- 
+--# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
+-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+- # num: number of 16x16 transforms to be done
+-@@ -87,17 +80,17 @@ test_add:
+- hevc_trans_16x16:
+-   push r6-r15, lr # TODO cut down number of used registers
+- 
+--  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
+--  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+  mov r3, 16*2 # Stride of transMatrix2 in bytes
+-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-   # Now use r0 to describe which matrix we are working on.
+-   # Allows us to prefetch the next block of coefficients for efficiency.
+-   mov r0,0 # This describes the location where we read our coefficients from
+--  mov r3,16*2 # Stride of coefficients in bytes
+-+  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+-   mov r7,16*16*2 # Total block size
+-   mov r8,64*16 # Value used to swap from current to next VRF location
+-   vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-   mov r4,64 # Constant used for rounding first pass
+--  mov r5,1<<19 # Constant used for rounding second pass
+-+  mov r5,1<<11 # Constant used for rounding second pass
+- 
+-   # At start of block r0,r1 point to the current block (that has already been loaded)
+- block_loop:
+-@@ -113,12 +106,12 @@ block_loop:
+-   vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-   #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-   vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+--  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
+-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
+- 
+-   bl col_trans_16
+--  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+--  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+--  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
+- 
+-   # Save results - note there has been a transposition during the processing so we save columns
+-   vsth VX(0,32++)+r0, (r1 += r3) REP 16
+-@@ -132,16 +125,136 @@ block_loop:
+- 
+- # r1,r2,r3 r7,r8 should be preserved
+- # HX(0++,0)+r0 is the block to be transformed
+--# HX(32++,0) is the 16x16 matrix of transform coefficients
+-+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+- # Use HY(48,0) for intermediate results
+- # r0 can be used, but should be returned to its original value at the end
+- col_trans_16:
+--  add r4,r0,16 # Final value for this loop
+-+  add r6,r0,16 # Final value for this loop
+- col_trans_16_loop:
+-   # First compute partial products for a single column
+--  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+--  addcmpblt r0,1,r4,col_trans_16_loop
+-+  addcmpblt r0,1,r6,col_trans_16_loop
+-   sub r0,16  # but r0 back to its original value
+-   b lr
+-+
+-+col_trans_odd_16:
+-+  add r6,r0,16 # Final value for this loop
+-+col_trans_odd_16_loop:
+-+  # First compute partial products for a single column
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-+  # Then sum up the results and place back
+-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-+  addcmpblt r0,1,r6,col_trans_odd_16_loop
+-+  sub r0,16  # but r0 back to its original value
+-+  b lr
+-+
+-+
+-+test_add:
+-+  vldh HX(0,0),(r0)
+-+  vadd HX(0,0),HX(0,0),10
+-+  vsth HX(0,0),(r0)
+-+  mov r0,7 # return value
+-+  b lr
+-+
+-+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+-+# num: number of 16x16 transforms to be done
+-+#
+-+hevc_trans_32x32:
+-+  push r6-r15, lr # TODO cut down number of used registers
+-+
+-+  # Fetch transform matrices
+-+  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+-+  add r0, 16*16*2
+-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+
+-+  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+-+  mov r7, 16*16*2 # Total block size
+-+  mov r4, 64 # Constant used for rounding first pass
+-+  mov r5, 1<<11 # Constant used for rounding second pass
+-+  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+-+  # set r8 to 32byte aligned stack pointer
+-+  add r8,sp,31
+-+  lsr r8,5
+-+  lsl r8,5
+-+  mov r9,r8  # Backup of the temporary storage
+-+  mov r10,r1 # Backup of the coefficient buffer
+-+block_loop32:
+-+
+-+  # COLUMN TRANSFORM
+-+  # Transform the first 16 columns
+-+  mov r1,r10  # Input Coefficient buffer
+-+  mov r8,r9   # Output temporary storage
+-+  bl trans32
+-+  # Transform the second 16 columns
+-+  add r8,32
+-+  add r1,32
+-+  bl trans32
+-+
+-+  # ROW TRANSFORM
+-+  mov r1,r9  # Input temporary storage
+-+  mov r8,r10   # Output Coefficient buffer
+-+  bl trans32
+-+  # Transform the second 16 columns
+-+  add r8,32
+-+  add r1,32
+-+  bl trans32
+-+
+-+  add r10, 32*32*2 # move onto next block of coefficients
+-+  addcmpbgt r2,-1,0,block_loop32
+-+
+-+  add sp,sp,32*32*2+32 # Restore stack
+-+
+-+  pop r6-r15, pc
+-+
+-+trans32:
+-+  # We can no longer afford the VRF space to do prefetching when doing 32x32
+-+  # Fetch the even rows
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  # Fetch the odd rows
+-+  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+-+
+-+  # Transform the even rows using even matrix
+-+  mov r0, 0 # Even rows
+-+  bl col_trans_16
+-+
+-+  # Now transform the odd rows using odd matrix
+-+  mov r0, 64*16 # Odd rows
+-+  bl col_trans_odd_16
+-+
+-+  # Now apply butterfly to compute the first 16 results
+-+  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  # 16bit results now in HX(48,32)
+-+  mov r0,r8
+-+  mov r6,32*2
+-+  vsth VX(48,32++),(r0+=r6) REP 16
+-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
+-+
+-+  # Now apply butterfly to compute the second 16 results (in reverse order)
+-+  vsub HY(63,0),HY(0,0),HY(16,0)
+-+  vsub HY(62,0),HY(0,0),HY(17,0)
+-+  vsub HY(61,0),HY(0,0),HY(18,0)
+-+  vsub HY(60,0),HY(0,0),HY(19,0)
+-+  vsub HY(59,0),HY(0,0),HY(20,0)
+-+  vsub HY(58,0),HY(0,0),HY(21,0)
+-+  vsub HY(57,0),HY(0,0),HY(22,0)
+-+  vsub HY(56,0),HY(0,0),HY(23,0)
+-+  vsub HY(55,0),HY(0,0),HY(24,0)
+-+  vsub HY(54,0),HY(0,0),HY(25,0)
+-+  vsub HY(53,0),HY(0,0),HY(26,0)
+-+  vsub HY(52,0),HY(0,0),HY(27,0)
+-+  vsub HY(51,0),HY(0,0),HY(28,0)
+-+  vsub HY(50,0),HY(0,0),HY(29,0)
+-+  vsub HY(49,0),HY(0,0),HY(30,0)
+-+  vsub HY(48,0),HY(0,0),HY(31,0)
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  add r0,r8,16*32*2 # Move to 16th row
+-+  vsth VX(48,32++),(r0+=r6) REP 16
+-+  b lr
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index b1f50ee..d720546 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -3,6 +3,7 @@
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+- #define RPI_TIME_TOTAL_QPU
+-+#define RPI_TIME_TOTAL_VPU
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -48,10 +49,47 @@ typedef int int32_t;
+- #define QPU_CODE_SIZE 2048
+- #define VPU_CODE_SIZE 2048
+- 
+-+const short rpi_transMatrix2even[32][16] = { // Even rows first
+-+{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
+-+{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
+-+{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
+-+{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
+-+{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
+-+{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
+-+{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
+-+{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
+-+{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
+-+{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
+-+{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
+-+{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
+-+{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
+-+{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
+-+{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
+-+{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
+-+// Odd rows
+-+{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
+-+{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+-+{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
+-+{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
+-+{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
+-+{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
+-+{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
+-+{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
+-+{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
+-+{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
+-+{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
+-+{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
+-+{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
+-+{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
+-+{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
+-+{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+-+};
+-+
+- struct GPU
+- {
+-   unsigned int qpu_code[QPU_CODE_SIZE];
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+-+  short transMatrix2even[16*16];
+-   int open_count; // Number of allocated video buffers
+-   unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-@@ -123,6 +161,8 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-+  // And the transform coefficients
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
+- 
+-   return 0;
+- }
+-@@ -274,11 +314,43 @@ unsigned int vpu_get_fn(void) {
+-   return gpu->vc + offsetof(struct GPU,vpu_code);
+- }
+- 
+-+unsigned int vpu_get_constants(void) {
+-+  if (gpu==NULL) {
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
+-+}
+-+
+- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+- {
+-   unsigned r;
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
+-+  static long long countr2=0;
+-+#endif
+-   gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
+-+#endif
+-   r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  countr2 += r2;
+-+  if ((count&0x7f)==0)
+-+    printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-   gpu_unlock();
+-   return r;
+- }
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 4e3c35c..814fc3c 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -34,6 +34,7 @@ extern unsigned int qpu_get_fn(int num);
+- 
+- // VPU specific functions
+- extern unsigned int vpu_get_fn(void);
+-+extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- 
+- // Simple test of shader code
+--- 
+-2.7.4
+-
+-
+-From 4bb0a7ba6723650e74d63cec2123f76da4c3eb0e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 09:41:23 +0100
+-Subject: [PATCH 05/68] Fixed deblocking
+-
+----
+- libavcodec/hevc.c | 20 +++++++++++++++++---
+- 1 file changed, 17 insertions(+), 3 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 94ff709..391c57a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2400,8 +2400,9 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    //gpu_cache_flush(&s->coeffs_buf[i]);
+-+    gpu_cache_flush(&s->coeffs_buf[i]);
+-     vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+-+    gpu_cache_flush(&s->coeffs_buf[i]);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2440,6 +2441,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+-+    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+- #endif
+- 
+-@@ -2473,9 +2475,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
+-+        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-+            int x;
+-+            // Transform all blocks
+-             rpi_execute_transform(s);
+-+            // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-+            // Perform deblocking for CTBs in this row
+-+            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
+-+                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
+-+            }
+-+            start_ctb_x = 0;
+-         }
+- #endif
+-         if (more_data < 0) {
+-@@ -2486,6 +2496,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         ctb_addr_ts++;
+-         ff_hevc_save_states(s, ctb_addr_ts);
+-+#ifdef RPI
+-+        if (s->enable_rpi)
+-+            continue;
+-+#endif
+-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+-     }
+- 
+-@@ -3289,7 +3303,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+-     for(i = 0; i < 4; i++) {
+--        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-+        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-         s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+-         if (!s->coeffs_buf_arm[i])
+-             goto fail;
+--- 
+-2.7.4
+-
+-
+-From 9079ef888e3d81a69f3c802ddc3c5134679e74a6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 11:32:30 +0100
+-Subject: [PATCH 06/68] Added 32x32 transform
+-
+----
+- libavcodec/hevc.c               |   8 +-
+- libavcodec/hevc_cabac.c         |   4 +-
+- libavcodec/rpi_hevc_transform.h | 200 +++++++++++++++++-----------------------
+- libavcodec/rpi_hevc_transform.s | 102 ++++++++++----------
+- libavcodec/rpi_qpu.c            |   4 +-
+- 5 files changed, 148 insertions(+), 170 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 391c57a..0dde6f2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2400,9 +2400,11 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf[i]);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+--    gpu_cache_flush(&s->coeffs_buf[i]);
+-+    gpu_cache_flush(&s->coeffs_buf[2]);
+-+    gpu_cache_flush(&s->coeffs_buf[3]);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
+-+    gpu_cache_flush(&s->coeffs_buf[2]);
+-+    gpu_cache_flush(&s->coeffs_buf[3]);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index d1cba86..88aa959 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1031,7 +1031,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+--    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
+-+#ifdef RPI
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
+-+#endif
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
+-     int explicit_rdpcm_flag = 0;
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index c0c279f..6d772d7 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -1,6 +1,10 @@
+- unsigned char rpi_hevc_transform [] = {
+- 169,
+- 3,
+-+62,
+++169,
+++3,
+++73,
+ +64,
+-+79,
+++52,
+++64,
+++45,
+++64,
+++2,
+++64,
+++10,
+ +64,
+- 3,
+- 232,
+- 32,
+-@@ -17,6 +21,22 @@ unsigned char rpi_hevc_transform [] = {
+- 248,
+- 0,
+- 0,
+ +64,
+++198,
+++1,
+++7,
+++8,
+ +232,
+++63,
+ +0,
+-+2,
+ +0,
+ +0,
+-+12,
+-+248,
+++6,
+++232,
+++253,
+++255,
+++255,
+++255,
+ +0,
+-+168,
+++246,
+ +0,
+ +0,
+-+192,
+++0,
+++4,
+++215,
+++64,
+++3,
+++96,
+++2,
+ +248,
+ +0,
+++35,
+++0,
+ +0,
+- 0,
+- 96,
+- 3,
+-@@ -79,7 +99,7 @@ unsigned char rpi_hevc_transform [] = {
+- 70,
+- 128,
+- 144,
+--39,
+-+40,
+- 0,
+- 4,
+- 255,
+-@@ -113,7 +133,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 128,
+- 144,
+--22,
+-+23,
+- 0,
+- 4,
+- 255,
+-@@ -153,6 +173,8 @@ unsigned char rpi_hevc_transform [] = {
+- 140,
+- 211,
+- 192,
+-+34,
+-+31,
+- 41,
+- 3,
+- 70,
+-@@ -195,7 +217,7 @@ unsigned char rpi_hevc_transform [] = {
+- 255,
+- 36,
+- 204,
+--96,
+-+224,
+- 2,
+- 0,
+- 248,
+-@@ -219,62 +241,10 @@ unsigned char rpi_hevc_transform [] = {
+- 103,
+- 90,
+- 0,
+--8,
+--240,
+--0,
+--128,
+--128,
+--3,
+--0,
+--247,
+--32,
+--128,
+--10,
+--4,
+--136,
+--240,
+--32,
+--0,
+--128,
+--3,
+--112,
+--96,
+--90,
+--0,
+--169,
+--3,
+--3,
+--232,
+--32,
+--0,
+--0,
+--0,
+--12,
+--248,
+--0,
+--136,
+--0,
+--0,
+--192,
+--248,
+--0,
+--0,
+-+225,
+ +64,
+-+242,
+- 64,
+--232,
+--0,
+--2,
+--0,
+--0,
+--12,
+--248,
+--0,
+--168,
+--0,
+--0,
+--192,
+--248,
+--0,
+--0,
+- 3,
+- 232,
+- 128,
+-@@ -287,18 +257,6 @@ unsigned char rpi_hevc_transform [] = {
+- 2,
+- 0,
+- 0,
+--4,
+--232,
+--64,
+--0,
+--0,
+--0,
+--5,
+--232,
+--0,
+--8,
+--0,
+--0,
+- 57,
+- 239,
+- 224,
+-@@ -317,18 +275,26 @@ unsigned char rpi_hevc_transform [] = {
+- 64,
+- 26,
+- 64,
+++56,
+++0,
+++0,
+ +4,
+-+232,
+++248,
+++0,
+++36,
+++0,
+++0,
+++64,
+++56,
+++8,
+++0,
+++0,
+++240,
+ +64,
+ +0,
+++132,
+++3,
+++128,
+++240,
+ +0,
+ +0,
+-+149,
+-+96,
+- 161,
+- 64,
+- 152,
+- 64,
+- 128,
+- 144,
+--31,
+++132,
+++3,
+++128,
+++144,
+++137,
+++0,
+++131,
+++98,
+++0,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++129,
+++0,
+++131,
+++102,
+++0,
+++158,
+++67,
+++0,
+++2,
+++248,
+++0,
+ +35,
+- 0,
+- 72,
+- 232,
+--32,
+- 0,
+-+4,
+- 0,
+- 0,
+- 65,
+-@@ -339,8 +305,16 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 128,
+- 144,
+--23,
+-+27,
+++0,
+++0,
+++64,
+++56,
+++0,
+ +0,
+ +4,
+-+232,
+++248,
+++0,
+++36,
+++0,
+ +0,
+++64,
+++56,
+ +8,
+- 0,
+ +0,
+-+69,
+-+96,
+- 145,
+- 64,
+- 168,
+-@@ -351,8 +325,8 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 72,
+- 232,
+--32,
+- 0,
+-+4,
+- 0,
+- 0,
+- 65,
+-@@ -373,7 +347,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 242,
+- 140,
+--229,
+-+221,
+- 192,
+- 57,
+- 239,
+-@@ -383,6 +357,8 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 41,
+- 3,
+-+239,
+++0,
+++240,
+++64,
+++0,
+++132,
+ +3,
+- 12,
+- 248,
+- 0,
+-@@ -390,7 +366,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 0,
+- 192,
+--8,
+-+248,
+- 4,
+- 0,
+- 12,
+-@@ -400,14 +376,14 @@ unsigned char rpi_hevc_transform [] = {
+- 64,
+- 0,
+- 192,
+--8,
+-+248,
+- 4,
+- 0,
+- 0,
+- 96,
+- 255,
+- 159,
+--131,
+-+154,
+- 255,
+- 0,
+- 232,
+-@@ -417,7 +393,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 255,
+- 159,
+--142,
+-+165,
+- 255,
+- 4,
+- 255,
+-@@ -429,7 +405,7 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 62,
+- 0,
+--5,
+-+4,
+- 255,
+- 51,
+- 204,
+-@@ -439,15 +415,15 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 16,
+- 0,
+--77,
+-+76,
+- 254,
+- 51,
+- 204,
+--9,
+--4,
+ +128,
+++240,
+++0,
+++0,
+++132,
+ +3,
+- 224,
+- 251,
+--0,
+++128,
+++144,
+++108,
+++0,
+++131,
+++98,
+++0,
+++255,
+++64,
+++0,
+++0,
+ +20,
+- 0,
+- 128,
+- 64,
+-@@ -467,16 +443,6 @@ unsigned char rpi_hevc_transform [] = {
+- 99,
+- 0,
+- 0,
+--4,
+--254,
+--0,
+--144,
+--128,
+--2,
+--0,
+--8,
+--2,
+--0,
+- 32,
+- 247,
+- 240,
+-@@ -488,92 +454,92 @@ unsigned char rpi_hevc_transform [] = {
+- 176,
+- 207,
+- 17,
+--3,
+-+19,
+- 32,
+- 247,
+- 112,
+- 207,
+- 18,
+--3,
+-+35,
+- 32,
+- 247,
+- 48,
+- 207,
+- 19,
+--3,
+-+51,
+- 32,
+- 247,
+- 240,
+- 206,
+- 20,
+--3,
+-+67,
+- 32,
+- 247,
+- 176,
+- 206,
+- 21,
+--3,
+-+83,
+- 32,
+- 247,
+- 112,
+- 206,
+- 22,
+--3,
+-+99,
+- 32,
+- 247,
+- 48,
+- 206,
+- 23,
+--3,
+-+115,
+- 32,
+- 247,
+- 240,
+- 205,
+- 24,
+--3,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++100,
+++0,
+ +131,
+- 32,
+- 247,
+- 176,
+- 205,
+- 25,
+--3,
+-+147,
+- 32,
+- 247,
+- 112,
+- 205,
+- 26,
+--3,
+-+163,
+- 32,
+- 247,
+- 48,
+- 205,
+- 27,
+--3,
+-+179,
+- 32,
+- 247,
+- 240,
+- 204,
+- 28,
+--3,
+-+195,
+- 32,
+- 247,
+- 176,
+- 204,
+- 29,
+--3,
+++102,
+++0,
+++248,
+++64,
+++0,
+++112,
+++0,
+++192,
+++243,
+ +211,
+- 32,
+- 247,
+- 112,
+- 204,
+- 30,
+--3,
+-+227,
+- 32,
+- 247,
+- 48,
+- 204,
+- 31,
+--3,
+--5,
+++31,
+++128,
+++248,
+++0,
+++0,
+++112,
+++0,
+++192,
+ +243,
+-+4,
+- 255,
+- 51,
+- 204,
+-@@ -583,20 +549,20 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 16,
+- 0,
+--77,
+-+76,
+- 254,
+- 51,
+- 204,
+--9,
+--4,
+++211,
+++31,
+ +128,
+-+3,
+- 224,
+- 251,
+--0,
+-+20,
+- 0,
+- 0,
+- 237,
+-+32,
+- 0,
+--4,
+- 0,
+- 0,
+- 140,
+-@@ -609,6 +575,6 @@ unsigned char rpi_hevc_transform [] = {
+- 99,
+- 0,
+- 0,
+--90,
+--0,
+-+111,
+-+3,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index 1e389c7..afdb32a 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -76,12 +76,19 @@
+- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+- # num: number of 16x16 transforms to be done
+-+# coeffs32
+-+# num32: number of 32x32 transforms
+- #
+- hevc_trans_16x16:
+-   push r6-r15, lr # TODO cut down number of used registers
+--
+-+  mov r14,r3 # coeffs32
+-+  mov r15,r4 # num32
+-   mov r3, 16*2 # Stride of transMatrix2 in bytes
+-   vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+
+-+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+
+-   # Now use r0 to describe which matrix we are working on.
+-   # Allows us to prefetch the next block of coefficients for efficiency.
+-   mov r0,0 # This describes the location where we read our coefficients from
+-@@ -121,6 +128,10 @@ block_loop:
+-   add r1,r7
+- 
+-   addcmpbgt r2,-1,0,block_loop
+-+
+-+  # Now go and do any 32x32 transforms
+-+  b hevc_trans_32x32
+-+
+-   pop r6-r15, pc
+- 
+- # r1,r2,r3 r7,r8 should be preserved
+-@@ -136,26 +147,18 @@ col_trans_16_loop:
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-   addcmpblt r0,1,r6,col_trans_16_loop
+--  sub r0,16  # but r0 back to its original value
+-+  sub r0,16  # put r0 back to its original value
+-   b lr
+- 
+- col_trans_odd_16:
+-   add r6,r0,16 # Final value for this loop
+- col_trans_odd_16_loop:
+-   # First compute partial products for a single column
+--  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-   addcmpblt r0,1,r6,col_trans_odd_16_loop
+--  sub r0,16  # but r0 back to its original value
+--  b lr
+--
+--
+--test_add:
+--  vldh HX(0,0),(r0)
+--  vadd HX(0,0),HX(0,0),10
+--  vsth HX(0,0),(r0)
+--  mov r0,7 # return value
+-+  sub r0,16  # put r0 back to its original value
+-   b lr
+- 
+- # hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+-@@ -164,18 +167,17 @@ test_add:
+- # num: number of 16x16 transforms to be done
+- #
+- hevc_trans_32x32:
+--  push r6-r15, lr # TODO cut down number of used registers
+-+  mov r1,r14 # coeffs
+-+  mov r2,r15 # num
+- 
+--  # Fetch transform matrices
+--  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+--  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+--  add r0, 16*16*2
+--  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+  # Fetch odd transform matrix
+-+  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+-+  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+-+  #add r0, 16*16*2
+-+  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+- 
+-   mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+-   mov r7, 16*16*2 # Total block size
+--  mov r4, 64 # Constant used for rounding first pass
+--  mov r5, 1<<11 # Constant used for rounding second pass
+-   sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+-   # set r8 to 32byte aligned stack pointer
+-   add r8,sp,31
+-@@ -186,21 +188,27 @@ hevc_trans_32x32:
+- block_loop32:
+- 
+-   # COLUMN TRANSFORM
+-+  mov r4, 64 # Constant used for rounding first pass
+-+  mov r5, 9 # left shift used for rounding first pass
+-+
+-   # Transform the first 16 columns
+-   mov r1,r10  # Input Coefficient buffer
+-   mov r8,r9   # Output temporary storage
+-   bl trans32
+-   # Transform the second 16 columns
+--  add r8,32
+-+  add r8,32*16*2
+-   add r1,32
+-   bl trans32
+- 
+-   # ROW TRANSFORM
+-+  mov r4, 1<<11 # Constant used for rounding second pass
+-+  mov r5, 4 # left shift used for rounding second pass
+-+
+-   mov r1,r9  # Input temporary storage
+-   mov r8,r10   # Output Coefficient buffer
+-   bl trans32
+-   # Transform the second 16 columns
+--  add r8,32
+-+  add r8,32*16*2
+-   add r1,32
+-   bl trans32
+- 
+-@@ -212,11 +220,12 @@ block_loop32:
+-   pop r6-r15, pc
+- 
+- trans32:
+-+  push lr
+-   # We can no longer afford the VRF space to do prefetching when doing 32x32
+-   # Fetch the even rows
+--  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  vldh HX(0++,0),(r1 += r3) REP 16
+-   # Fetch the odd rows
+--  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+-+  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+- 
+-   # Transform the even rows using even matrix
+-   mov r0, 0 # Even rows
+-@@ -228,33 +237,32 @@ trans32:
+- 
+-   # Now apply butterfly to compute the first 16 results
+-   vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
+-   # 16bit results now in HX(48,32)
+-   mov r0,r8
+-   mov r6,32*2
+-   vsth VX(48,32++),(r0+=r6) REP 16
+--  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
+- 
+-   # Now apply butterfly to compute the second 16 results (in reverse order)
+--  vsub HY(63,0),HY(0,0),HY(16,0)
+--  vsub HY(62,0),HY(0,0),HY(17,0)
+--  vsub HY(61,0),HY(0,0),HY(18,0)
+--  vsub HY(60,0),HY(0,0),HY(19,0)
+--  vsub HY(59,0),HY(0,0),HY(20,0)
+--  vsub HY(58,0),HY(0,0),HY(21,0)
+--  vsub HY(57,0),HY(0,0),HY(22,0)
+--  vsub HY(56,0),HY(0,0),HY(23,0)
+--  vsub HY(55,0),HY(0,0),HY(24,0)
+--  vsub HY(54,0),HY(0,0),HY(25,0)
+--  vsub HY(53,0),HY(0,0),HY(26,0)
+--  vsub HY(52,0),HY(0,0),HY(27,0)
+--  vsub HY(51,0),HY(0,0),HY(28,0)
+--  vsub HY(50,0),HY(0,0),HY(29,0)
+--  vsub HY(49,0),HY(0,0),HY(30,0)
+--  vsub HY(48,0),HY(0,0),HY(31,0)
+--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+--  add r0,r8,16*32*2 # Move to 16th row
+-+  vsub HY(63,0),HY(0 ,0),HY(16,0)
+-+  vsub HY(62,0),HY(1 ,0),HY(17,0)
+-+  vsub HY(61,0),HY(2 ,0),HY(18,0)
+-+  vsub HY(60,0),HY(3 ,0),HY(19,0)
+-+  vsub HY(59,0),HY(4 ,0),HY(20,0)
+-+  vsub HY(58,0),HY(5 ,0),HY(21,0)
+-+  vsub HY(57,0),HY(6 ,0),HY(22,0)
+-+  vsub HY(56,0),HY(7 ,0),HY(23,0)
+-+  vsub HY(55,0),HY(8 ,0),HY(24,0)
+-+  vsub HY(54,0),HY(9 ,0),HY(25,0)
+-+  vsub HY(53,0),HY(10,0),HY(26,0)
+-+  vsub HY(52,0),HY(11,0),HY(27,0)
+-+  vsub HY(51,0),HY(12,0),HY(28,0)
+-+  vsub HY(50,0),HY(13,0),HY(29,0)
+-+  vsub HY(49,0),HY(14,0),HY(30,0)
+-+  vsub HY(48,0),HY(15,0),HY(31,0)
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
+-+  add r0,r8,32
+-   vsth VX(48,32++),(r0+=r6) REP 16
+--  b lr
+-+  pop pc
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index d720546..12ad5fb 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -89,7 +89,7 @@ struct GPU
+- {
+-   unsigned int qpu_code[QPU_CODE_SIZE];
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+--  short transMatrix2even[16*16];
+-+  short transMatrix2even[16*16*2];
+-   int open_count; // Number of allocated video buffers
+-   unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-@@ -162,7 +162,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-   // And the transform coefficients
+--  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+- 
+-   return 0;
+- }
+--- 
+-2.7.4
+-
+-
+-From 6c2ed6109c4dd5c8ab16bf16e0ae3be6ae166e50 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 16:57:03 +0100
+-Subject: [PATCH 07/68] Clear coefficients in advance
+-
+----
+- libavcodec/hevc.c               | 129 ++++++++++++++++++++++++++++------------
+- libavcodec/hevc.h               |   6 +-
+- libavcodec/hevc_cabac.c         |   7 ++-
+- libavcodec/rpi_hevc_transform.h |  50 ++++++++++++++++
+- libavcodec/rpi_hevc_transform.s |  16 +++++
+- 5 files changed, 168 insertions(+), 40 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 0dde6f2..1424007 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,6 +43,8 @@
+- 
+- #ifdef RPI
+- #include "rpi_qpu.h"
+-+// For some unknown reason, the code seems to crash if I do a late malloc
+-+#define EARLY_MALLOC
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -61,6 +63,20 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- /* free everything allocated  by pic_arrays_init() */
+- static void pic_arrays_free(HEVCContext *s)
+- {
+-+#ifdef RPI
+-+#ifdef EARLY_MALLOC
+-+#else
+-+    printf("pic_arrays_free\n");
+-+    if (s->coeffs_buf_arm[0]) {
+-+      gpu_free(&s->coeffs_buf_default);
+-+      s->coeffs_buf_arm[0] = 0;
+-+    }
+-+    if (s->coeffs_buf_arm[2]) {
+-+      gpu_free(&s->coeffs_buf_accelerated);
+-+      s->coeffs_buf_arm[2] = 0;
+-+    }
+-+#endif
+-+#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -97,6 +113,28 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int ctb_count        = sps->ctb_width * sps->ctb_height;
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+-+#ifdef RPI
+-+#ifdef EARLY_MALLOC
+-+#else
+-+    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
+-+    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    printf("pic_arrays_init\n");
+-+    printf("Allocated %d\n",coefs_per_row);
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+    if (!s->coeffs_buf_arm[0])
+-+        goto fail;
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+    if (!s->coeffs_buf_arm[2])
+-+        goto fail;
+-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+    printf("Done\n");
+-+#endif
+-+#endif
+-+
+-     s->bs_width  = (width  >> 2) + 1;
+-     s->bs_height = (height >> 2) + 1;
+- 
+-@@ -2400,11 +2438,10 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf[2]);
+--    gpu_cache_flush(&s->coeffs_buf[3]);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
+--    gpu_cache_flush(&s->coeffs_buf[2]);
+--    gpu_cache_flush(&s->coeffs_buf[3]);
+-+
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-+    //gpu_cache_flush(&s->coeffs_buf_accelerated);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2426,7 +2463,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-       } else {
+-+          int trafo_size = 1 << cmd->size;
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+-@@ -3235,10 +3274,18 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_mv_cmds);
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+--    for(i = 0; i < 4; i++) {
+--        gpu_free(&s->coeffs_buf[i]);
+-+
+-+#ifdef EARLY_MALLOC
+-+    if (s->coeffs_buf_arm[0]) {
+-+      gpu_free(&s->coeffs_buf_default);
+-+      s->coeffs_buf_arm[0] = 0;
+-+    }
+-+    if (s->coeffs_buf_arm[2]) {
+-+      gpu_free(&s->coeffs_buf_accelerated);
+-+      s->coeffs_buf_arm[2] = 0;
+-     }
+- #endif
+-+#endif
+- 
+-     for (i = 0; i < 3; i++) {
+-         av_freep(&s->sao_pixel_buffer_h[i]);
+-@@ -3281,6 +3328,16 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     return 0;
+- }
+- 
+-+#ifdef RPI
+-+static av_cold void memclear16(int16_t *p, int n)
+-+{
+-+  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+-+  //int i;
+-+  //for(i=0;i<n;i++)
+-+  //  p[i] = 0;
+-+}
+-+#endif
+-+
+- static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-     HEVCContext *s = avctx->priv_data;
+-@@ -3304,37 +3361,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+--    for(i = 0; i < 4; i++) {
+--        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+--        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+--        if (!s->coeffs_buf_arm[i])
+--            goto fail;
+--    }
+--    s->enable_rpi = 0;
+- 
+--    // A little test program
+--    /*{
+--      GPU_MEM_PTR_T p;
+--      int err = gpu_malloc_cached(16, &p);
+--      short *q = (short *)p.arm;
+--      int i;
+--      int r;
+--      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
+--      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
+--      printf("Preparing data %p\n",q);
+--      for(i=0;i<16;i++)
+--        q[i] = i;
+--      printf("Flush cache\n");
+--      gpu_cache_flush(&p);
+--      printf("Executing code\n");
+--      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
+--      printf("Return value %d (",r);
+--      for(i=0;i<16;i++)
+--        printf("%d ",q[i]);
+--      printf(")\n");
+--      gpu_free(&p);
+--      goto fail; // Early out
+--    }*/
+-+    s->coeffs_buf_arm[0] = 0;
+-+    s->coeffs_buf_arm[2] = 0;
+-+
+-+#ifdef EARLY_MALLOC
+-+    int coeffs_in_ctb = 64*64;
+-+    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    printf("Allocated %d\n",coefs_per_row);
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+    if (!s->coeffs_buf_arm[0])
+-+        goto fail;
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+    if (!s->coeffs_buf_arm[2])
+-+        goto fail;
+-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+    printf("Done\n");
+-+    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-+    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+-+    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+-+#endif
+-+
+-+    s->enable_rpi = 0;
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 4167985..9a228f6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -861,8 +861,12 @@ typedef struct HEVCContext {
+-     HEVCMvCmd *unif_mv_cmds;
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+--    GPU_MEM_PTR_T coeffs_buf[4];
+-+    int buf_width;
+-+    GPU_MEM_PTR_T coeffs_buf_default;
+-+    GPU_MEM_PTR_T coeffs_buf_accelerated;
+-     int16_t *coeffs_buf_arm[4];
+-+    unsigned int coeffs_buf_vc[4];
+-+
+-     int num_coeffs[4];
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 88aa959..dbfee85 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1058,9 +1058,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-             s->num_coeffs[0] += n;
+-         }
+-     }
+-+    // We now do the memset after transform_add while we know the data is cached.
+-+    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+#else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- #endif
+- 
+--    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+
+- 
+-     // Derive QP for dequant
+-     if (!lc->cu.cu_transquant_bypass_flag) {
+-@@ -1547,7 +1551,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+- #ifdef RPI
+-     if (s->enable_rpi) {
+-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+--        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+-         cmd->buf = coeffs;
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 6d772d7..4f13622 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -1,4 +1,10 @@
+- unsigned char rpi_hevc_transform [] = {
+-+21,
+-+106,
+-+0,
+ +144,
+-+35,
+-+1,
+- 169,
+- 3,
+- 62,
+-@@ -577,4 +583,48 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 111,
+- 3,
+-+4,
+-+254,
+++161,
+ +0,
+-+128,
+++188,
+++64,
+++67,
+++232,
+ +0,
+-+4,
+++2,
+ +0,
+-+248,
+ +0,
+ +0,
+-+2,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++150,
+++0,
+++195,
+ +232,
+-+32,
+ +0,
+++2,
+ +0,
+ +0,
+-+140,
+++12,
+++128,
+++7,
+++192,
+++130,
+ +248,
+-+32,
+-+0,
+ +0,
+ +0,
+++112,
+++192,
+ +224,
+-+35,
+++16,
+++195,
+++31,
+++132,
+++248,
+++1,
+ +0,
+++112,
+ +0,
+-+64,
+++224,
+++16,
+++203,
+++31,
+++3,
+++99,
+++131,
+++71,
+++68,
+ +232,
+++32,
+ +0,
+-+2,
+ +0,
+ +0,
+-+193,
+-+232,
+ +0,
+-+1,
+++99,
+++2,
+++99,
+++23,
+++102,
+++7,
+++106,
+++127,
+++156,
+++182,
+++255,
+ +0,
+++248,
+++64,
+ +0,
+-+1,
+-+106,
+-+116,
+-+30,
+-+90,
+++112,
+ +0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index afdb32a..fd159bc 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -78,8 +78,11 @@
+- # num: number of 16x16 transforms to be done
+- # coeffs32
+- # num32: number of 32x32 transforms
+-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
+- #
+- hevc_trans_16x16:
+-+  cmp r5,1
+-+  beq memclear16
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -266,3 +269,16 @@ trans32:
+-   add r0,r8,32
+-   vsth VX(48,32++),(r0+=r6) REP 16
+-   pop pc
+-+
+-+memclear16:
+-+  # r0 is address
+-+  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
+-+  vmov HX(0++,0),0 REP 16
+-+  mov r2,32
+-+loop:
+-+  vsth HX(0++,0),(r0+=r2) REP 16
+-+  add r0,16*16*2
+-+  sub r1,16*16
+-+  cmp r1,0
+-+  bgt loop
+-+  b lr
+--- 
+-2.7.4
+-
+-
+-From 48282c2fb55c0d9a72222f384c03c432f78a3016 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 09:56:43 +0100
+-Subject: [PATCH 08/68] Prepared inter offload
+-
+----
+- libavcodec/hevc.c       | 116 +++++++++++++++++++++++++++++++++++++++++++-----
+- libavcodec/hevc.h       |  29 +++++++++++-
+- libavcodec/hevc_cabac.c |   5 ++-
+- 3 files changed, 137 insertions(+), 13 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 1424007..8215201 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -45,6 +45,8 @@
+- #include "rpi_qpu.h"
+- // For some unknown reason, the code seems to crash if I do a late malloc
+- #define EARLY_MALLOC
+-+// Move Inter prediction into separate pass
+-+//#define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -1440,6 +1442,95 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+-  * @param luma_offset additive offset applied to the luma prediction value
+-  */
+- 
+-+#ifdef RPI_INTER
+-+#define RPI_REDIRECT(fn) rpi_ ## fn
+-+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-+                        int block_w, int block_h, int luma_weight, int luma_offset)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_LUMA_UNI;
+-+    cmd->dst = dst;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref->data[0];
+-+    cmd->srcstride = ref->linesize[0];
+-+    cmd->mv = *mv;
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = luma_weight;
+-+    cmd->offset = luma_offset;
+-+}
+-+
+-+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_LUMA_BI;
+-+    cmd->dst = dst;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref->data[0];
+-+    cmd->srcstride = ref->linesize[0];
+-+    cmd->mv = *mv;
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = luma_weight;
+-+    cmd->offset = luma_offset;
+-+    cmd->src1 = ref1->data[];
+-+    cmd->srcstride1 = ref1->linesize[0];
+-+    cmd->mv1 = *mv1;
+-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
+-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
+-+}
+-+
+-+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_CHROMA_UNI;
+-+    cmd->dst = dst0;
+-+    cmd->dststride = dststride;
+-+    cmd->src = src0;
+-+    cmd->srcstride = srcstride;
+-+    cmd->mv = current_mv->mv[reflist];
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = chroma_weight;
+-+    cmd->offset = chroma_offset;
+-+}
+-+
+-+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-+    cmd->dst = dst0;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref0->data[cidx+1];
+-+    cmd->srcstride = ref0->linesize[cidx+1];
+-+    cmd->mv = current_mv->mv[reflist];
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = chroma_weight;
+-+    cmd->offset = chroma_offset;
+-+    cmd->src = ref1->data[cidx+1];
+-+    cmd->srcstride1 = ref1->linesize[cidx+1];
+-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
+-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
+-+}
+-+#else
+-+#define RPI_REDIRECT(fn) fn
+-+#endif
+-+
+- static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+-@@ -1505,7 +1596,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-  * @param mv1 motion vector1 (relative to block position) to get pixel data from
+-  * @param current_mv current motion vector structure
+-  */
+-- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+-@@ -1887,16 +1978,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+-         }
+-@@ -1906,17 +1997,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+- 
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+-         }
+-@@ -1926,15 +2017,15 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+-+        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+- 
+--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
+-         }
+-     }
+-@@ -2465,7 +2556,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-       } else {
+-           int trafo_size = 1 << cmd->size;
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+#ifdef RPI_PRECLEAR
+-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-+#endif
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+-@@ -3381,6 +3474,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-     s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-     printf("Done\n");
+-+#ifdef RPI_PRECLEAR
+-     //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-     memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-     //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-@@ -3389,6 +3483,8 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+- #endif
+- 
+-+#endif
+-+
+-     s->enable_rpi = 0;
+- 
+- #endif
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 9a228f6..1ac119a 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -803,14 +803,39 @@ typedef struct HEVCLocalContext {
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+- #define RPI_MAX_WIDTH 2048
+- 
+--// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
+--#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
+-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+- 
+-+#define RPI_CMD_LUMA_UNI 0
+-+#define RPI_CMD_CHROMA_UNI 1
+-+#define RPI_CMD_LUMA_BI 2
+-+#define RPI_CMD_U_BI 3
+-+#define RPI_CMD_V_BI 4
+-+
+-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+-+// #define RPI_PRECLEAR
+-+
+- // Command for inter prediction
+- typedef struct HEVCMvCmd {
+-+    int cmd;
+-+    uint8_t *dst;
+-+    ptrdiff_t dststride;
+-+    uint8_t *src;
+-+    ptrdiff_t srcstride;
+-+    Mv mv;
+-+    int x_off;
+-+    int y_off;
+-+    int block_w;
+-+    int block_h;
+-+    int weight;
+-+    int offset;
+-+    uint8_t *src1;
+-+    ptrdiff_t srcstride1;
+-+    Mv mv1;
+-+    int8_t ref_idx[2];
+- } HEVCMvCmd;
+- 
+- // Command for transform to process a block of coefficients
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index dbfee85..4f072be 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1059,7 +1059,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-         }
+-     }
+-     // We now do the memset after transform_add while we know the data is cached.
+--    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #ifdef RPI_PRECLEAR
+-+    #else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #endif
+- #else
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 25d3b4e876febe08302a01abd85d5009160ead3e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 11:08:50 +0100
+-Subject: [PATCH 09/68] Inter prediction in separate pass
+-
+----
+- libavcodec/hevc.c | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
+- libavcodec/hevc.h |  2 +-
+- 2 files changed, 77 insertions(+), 18 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 8215201..b7bc6ad 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -46,7 +46,7 @@
+- // For some unknown reason, the code seems to crash if I do a late malloc
+- #define EARLY_MALLOC
+- // Move Inter prediction into separate pass
+--//#define RPI_INTER
+-+#define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -1448,7 +1448,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_LUMA_UNI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1467,31 +1467,29 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_LUMA_BI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+--    cmd->src = ref->data[0];
+--    cmd->srcstride = ref->linesize[0];
+--    cmd->mv = *mv;
+-+    cmd->src = ref0->data[0];
+-+    cmd->srcstride = ref0->linesize[0];
+-+    cmd->mv = *mv0;
+-     cmd->x_off = x_off;
+-     cmd->y_off = y_off;
+-     cmd->block_w = block_w;
+-     cmd->block_h = block_h;
+--    cmd->weight = luma_weight;
+--    cmd->offset = luma_offset;
+--    cmd->src1 = ref1->data[];
+-+    cmd->src1 = ref1->data[0];
+-     cmd->srcstride1 = ref1->linesize[0];
+-     cmd->mv1 = *mv1;
+-     cmd->ref_idx[0] = current_mv->ref_idx[0];
+-     cmd->ref_idx[1] = current_mv->ref_idx[1];
+- }
+- 
+--static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_CHROMA_UNI;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -1506,27 +1504,27 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-     cmd->offset = chroma_offset;
+- }
+- 
+--static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-     cmd->src = ref0->data[cidx+1];
+-     cmd->srcstride = ref0->linesize[cidx+1];
+--    cmd->mv = current_mv->mv[reflist];
+-+    cmd->mv = current_mv->mv[0];
+-+    cmd->mv1 = current_mv->mv[1];
+-     cmd->x_off = x_off;
+-     cmd->y_off = y_off;
+-     cmd->block_w = block_w;
+-     cmd->block_h = block_h;
+--    cmd->weight = chroma_weight;
+--    cmd->offset = chroma_offset;
+--    cmd->src = ref1->data[cidx+1];
+-+    cmd->src1 = ref1->data[cidx+1];
+-     cmd->srcstride1 = ref1->linesize[cidx+1];
+-     cmd->ref_idx[0] = current_mv->ref_idx[0];
+-     cmd->ref_idx[1] = current_mv->ref_idx[1];
+- }
+-+
+- #else
+- #define RPI_REDIRECT(fn) fn
+- #endif
+-@@ -2554,7 +2552,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-       } else {
+-+#ifdef RPI_PRECLEAR
+-           int trafo_size = 1 << cmd->size;
+-+#endif
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+- #ifdef RPI_PRECLEAR
+-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-@@ -2563,6 +2563,61 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-   }
+-   s->num_pred_cmds = 0;
+- }
+-+
+-+static void rpi_execute_inter_cmds(HEVCContext *s)
+-+{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds;
+-+    int n,cidx;
+-+    AVFrame myref;
+-+    AVFrame myref1;
+-+    struct MvField mymv;
+-+    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
+-+        printf("Overflow inter_cmds\n");
+-+        exit(-1);
+-+    }
+-+    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
+-+        switch(cmd->cmd) {
+-+        case RPI_CMD_LUMA_UNI:
+-+            myref.data[0] = cmd->src;
+-+            myref.linesize[0] = cmd->srcstride;
+-+            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+-+            break;
+-+        case RPI_CMD_LUMA_BI:
+-+            myref.data[0] = cmd->src;
+-+            myref.linesize[0] = cmd->srcstride;
+-+            myref1.data[0] = cmd->src1;
+-+            myref1.linesize[0] = cmd->srcstride1;
+-+            mymv.ref_idx[0] = cmd->ref_idx[0];
+-+            mymv.ref_idx[1] = cmd->ref_idx[1];
+-+            luma_mc_bi(s, cmd->dst, cmd->dststride,
+-+                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+-+                       &myref1, &cmd->mv1, &mymv);
+-+            break;
+-+        case RPI_CMD_CHROMA_UNI:
+-+            mymv.mv[0] = cmd->mv;
+-+            chroma_mc_uni(s, cmd->dst,
+-+                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+-+                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+-+            break;
+-+        case RPI_CMD_CHROMA_BI:
+-+        case RPI_CMD_CHROMA_BI+1:
+-+            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+-+            myref.data[cidx+1] = cmd->src;
+-+            myref.linesize[cidx+1] = cmd->srcstride;
+-+            myref1.data[cidx+1] = cmd->src1;
+-+            myref1.linesize[cidx+1] = cmd->srcstride1;
+-+            mymv.ref_idx[0] = cmd->ref_idx[0];
+-+            mymv.ref_idx[1] = cmd->ref_idx[1];
+-+            mymv.mv[0] = cmd->mv;
+-+            mymv.mv[1] = cmd->mv1;
+-+            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+-+                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+-+            break;
+-+        }
+-+    }
+-+    s->num_mv_cmds = 0;
+-+}
+-+
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -2611,6 +2666,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-+            // Perform inter prediction
+-+            rpi_execute_inter_cmds(s);
+-             // Transform all blocks
+-             rpi_execute_transform(s);
+-             // Perform intra prediction and residual reconstruction
+-@@ -3422,6 +3479,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- }
+- 
+- #ifdef RPI
+-+#ifdef RPI_PRECLEAR
+- static av_cold void memclear16(int16_t *p, int n)
+- {
+-   vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+-@@ -3430,6 +3488,7 @@ static av_cold void memclear16(int16_t *p, int n)
+-   //  p[i] = 0;
+- }
+- #endif
+-+#endif
+- 
+- static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 1ac119a..a0eb71b 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -812,7 +812,7 @@ typedef struct HEVCLocalContext {
+- #define RPI_CMD_LUMA_UNI 0
+- #define RPI_CMD_CHROMA_UNI 1
+- #define RPI_CMD_LUMA_BI 2
+--#define RPI_CMD_U_BI 3
+-+#define RPI_CMD_CHROMA_BI 3
+- #define RPI_CMD_V_BI 4
+- 
+- // RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+--- 
+-2.7.4
+-
+-
+-From 8af0a0a036e4bb3883f144d0567bc527772dd65b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 13:03:50 +0100
+-Subject: [PATCH 10/68] Added VPU thread
+-
+----
+- libavcodec/hevc.c    |  11 +++--
+- libavcodec/hevc.h    |   1 +
+- libavcodec/rpi_qpu.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.h |   2 +
+- 4 files changed, 133 insertions(+), 6 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b7bc6ad..98dbd69 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2529,8 +2529,10 @@ static void rpi_execute_transform(HEVCContext *s)
+- 
+- 
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-+    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    //vpu_wait(s->vpu_id);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2666,10 +2668,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+--            // Perform inter prediction
+--            rpi_execute_inter_cmds(s);
+-             // Transform all blocks
+-             rpi_execute_transform(s);
+-+            // Perform inter prediction
+-+            rpi_execute_inter_cmds(s);
+-+            // Wait for transform completion
+-+            vpu_wait(s->vpu_id);
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+-@@ -3426,6 +3430,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->univ_pred_cmds);
+- 
+- #ifdef EARLY_MALLOC
+-+    printf("hevc_decode_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-       gpu_free(&s->coeffs_buf_default);
+-       s->coeffs_buf_arm[0] = 0;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index a0eb71b..0d8dfe9 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -896,6 +896,7 @@ typedef struct HEVCContext {
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-     int num_pred_cmds;
+-+    int vpu_id;
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 12ad5fb..378dd74 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,9 +1,13 @@
+- #ifdef RPI
+--// Use the vcsm device for shared memory
+-+// define RPI_USE_VCSM to use the vcsm device for shared memory
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+--#define RPI_TIME_TOTAL_QPU
+--#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+-+//#define RPI_TIME_TOTAL_QPU
+-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-+//#define RPI_TIME_TOTAL_VPU
+-+// define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+-+#define RPI_ASYNC
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -113,6 +117,19 @@ static unsigned int Microseconds(void) {
+- }
+- #endif
+- 
+-+#ifdef RPI_ASYNC
+-+pthread_t vpu_thread;
+-+static void *vpu_start(void *arg);
+-+
+-+#define MAXCMDS 128
+-+static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
+-+static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+
+-+static int vpu_cmds[MAXCMDS][8];
+-+static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+-+static volatile int vpu_async_head=0;
+-+#endif
+-+
+- // Connect to QPU, returns 0 on success.
+- static int gpu_init(volatile struct GPU **gpu) {
+-   int mb = mbox_open();
+-@@ -164,12 +181,27 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   // And the transform coefficients
+-   memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+- 
+-+#ifdef RPI_ASYNC
+-+  {
+-+    int err;
+-+    vpu_async_tail = 0;
+-+    vpu_async_head = 0;
+-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-+    //printf("Created thread\n");
+-+    if (err) {
+-+        printf("Failed to create vpu thread\n");
+-+        return -4;
+-+    }
+-+  }
+-+#endif
+-+
+-   return 0;
+- }
+- 
+- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+- static void gpu_lock(void) {
+-   pthread_mutex_lock(&gpu_mutex);
+-+
+-   if (gpu==NULL) {
+-     gpu_init(&gpu);
+-   }
+-@@ -264,6 +296,16 @@ static void gpu_term(void)
+- 	unsigned handle = gpu->vc_handle;
+-   if (gpu==NULL)
+-     return;
+-+
+-+#ifdef RPI_ASYNC
+-+  {
+-+    void *res;
+-+    vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
+-+    pthread_join(vpu_thread, &res);
+-+  }
+-+#endif
+-+
+-+
+- 	unmapmem((void*)gpu, sizeof(struct GPU));
+- 	mem_unlock(mb, handle);
+- 	mem_free(mb, handle);
+-@@ -322,6 +364,79 @@ unsigned int vpu_get_constants(void) {
+-   return gpu->vc + offsetof(struct GPU,transMatrix2even);
+- }
+- 
+-+#ifdef RPI_ASYNC
+-+
+-+static void *vpu_start(void *arg) {
+-+  while(1) {
+-+    pthread_mutex_lock(&post_mutex);
+-+    while( vpu_async_tail - vpu_async_head <= 0)
+-+    {
+-+      //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+-+      pthread_cond_wait(&post_cond, &post_mutex);
+-+    }
+-+    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    pthread_mutex_unlock(&post_mutex);
+-+
+-+    if (p[6] == -1) {
+-+      break; // Last job
+-+    }
+-+    if (p[7]) {
+-+        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-+        //gpu_cache_flush(buf);
+-+    }
+-+    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+
+-+    pthread_mutex_lock(&post_mutex);
+-+    vpu_async_head++;
+-+    pthread_cond_broadcast(&post_cond);
+-+    pthread_mutex_unlock(&post_mutex);
+-+  }
+-+
+-+  return NULL;
+-+}
+-+
+-+// Post a command to the queue
+-+// Returns an id which we can use to wait for completion
+-+int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+-+{
+-+  pthread_mutex_lock(&post_mutex);
+-+  {
+-+    int id = vpu_async_tail++;
+-+    int *p = vpu_cmds[id%MAXCMDS];
+-+    int num = vpu_async_tail - vpu_async_head;
+-+    if (num>MAXCMDS) {
+-+      printf("Too many commands submitted\n");
+-+      exit(-1);
+-+    }
+-+    p[0] = code;
+-+    p[1] = r0;
+-+    p[2] = r1;
+-+    p[3] = r2;
+-+    p[4] = r3;
+-+    p[5] = r4;
+-+    p[6] = r5;
+-+    p[7] = (int) buf;
+-+    if (num<=1)
+-+      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
+-+    pthread_mutex_unlock(&post_mutex);
+-+    return id;
+-+  }
+-+}
+-+
+-+// Wait for completion of the given command
+-+void vpu_wait(int id)
+-+{
+-+  pthread_mutex_lock(&post_mutex);
+-+  while( id + 1 - vpu_async_head > 0)
+-+  {
+-+    pthread_cond_wait(&post_cond, &post_mutex);
+-+  }
+-+  pthread_mutex_unlock(&post_mutex);
+-+}
+-+
+-+#endif
+-+
+-+
+- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+- {
+-   unsigned r;
+-@@ -334,7 +449,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
+-   static int count=0;
+-   static long long countr2=0;
+- #endif
+-+#ifndef RPI_ASYNC
+-   gpu_lock();
+-+#endif
+- #ifdef RPI_TIME_TOTAL_VPU
+-   start_time = Microseconds();
+-   if (last_time==0)
+-@@ -351,7 +468,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
+-   if ((count&0x7f)==0)
+-     printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
+- #endif
+-+#ifndef RPI_ASYNC
+-   gpu_unlock();
+-+#endif
+-   return r;
+- }
+- 
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 814fc3c..3526fce 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -36,6 +36,8 @@ extern unsigned int qpu_get_fn(int num);
+- extern unsigned int vpu_get_fn(void);
+- extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+- extern int rpi_test_shader(void);
+--- 
+-2.7.4
+-
+-
+-From 016d3db644e60fbe272bfcf1d7c3670c82422317 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 15:03:37 +0100
+-Subject: [PATCH 11/68] Added different signal when tail moves
+-
+----
+- libavcodec/rpi_qpu.c | 11 ++++++-----
+- 1 file changed, 6 insertions(+), 5 deletions(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 378dd74..d1c3e20 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -122,7 +122,8 @@ pthread_t vpu_thread;
+- static void *vpu_start(void *arg);
+- 
+- #define MAXCMDS 128
+--static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
+-+static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+-+static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+- static int vpu_cmds[MAXCMDS][8];
+-@@ -372,7 +373,7 @@ static void *vpu_start(void *arg) {
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+--      pthread_cond_wait(&post_cond, &post_mutex);
+-+      pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+-     int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -388,7 +389,7 @@ static void *vpu_start(void *arg) {
+- 
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+--    pthread_cond_broadcast(&post_cond);
+-+    pthread_cond_broadcast(&post_cond_head);
+-     pthread_mutex_unlock(&post_mutex);
+-   }
+- 
+-@@ -417,7 +418,7 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+-     p[6] = r5;
+-     p[7] = (int) buf;
+-     if (num<=1)
+--      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
+-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-     return id;
+-   }
+-@@ -429,7 +430,7 @@ void vpu_wait(int id)
+-   pthread_mutex_lock(&post_mutex);
+-   while( id + 1 - vpu_async_head > 0)
+-   {
+--    pthread_cond_wait(&post_cond, &post_mutex);
+-+    pthread_cond_wait(&post_cond_head, &post_mutex);
+-   }
+-   pthread_mutex_unlock(&post_mutex);
+- }
+--- 
+-2.7.4
+-
+-
+-From b04a72641253dc89fd1ec688035c3e2a946aa370 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 08:57:11 +0100
+-Subject: [PATCH 12/68] Add option to test for gpu_idle
+-
+----
+- libavcodec/hevc.c    |  3 ++-
+- libavcodec/rpi_qpu.c | 18 ++++++++++++++++++
+- 2 files changed, 20 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 98dbd69..2e269b6 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2527,7 +2527,6 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-@@ -2669,6 +2668,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-             // Transform all blocks
+-+            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index d1c3e20..85f49db 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -199,6 +199,17 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   return 0;
+- }
+- 
+-+// Returns 1 if the gpu is currently idle
+-+static int gpu_idle(void)
+-+{
+-+  int ret = pthread_mutex_trylock(&gpu_mutex);
+-+  if (ret==0) {
+-+    pthread_mutex_unlock(&gpu_mutex);
+-+    return 1;
+-+  }
+-+  return 0;
+-+}
+-+
+- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+- static void gpu_lock(void) {
+-   pthread_mutex_lock(&gpu_mutex);
+-@@ -400,6 +411,13 @@ static void *vpu_start(void *arg) {
+- // Returns an id which we can use to wait for completion
+- int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+- {
+-+  // If the gpu is idle then just run the command immediately
+-+  // This works, but doesn't seem to give any benefit
+-+  // if (gpu_idle()) {
+-+  //   vpu_execute_code( code,  r0,  r1,  r2,  r3,  r4,  r5);
+-+  //   return -1; // TODO perhaps a wraparound bug here?
+-+  // }
+-+
+-   pthread_mutex_lock(&post_mutex);
+-   {
+-     int id = vpu_async_tail++;
+--- 
+-2.7.4
+-
+-
+-From e7b457e683d4ca92bf2677b69708fbfc3849847b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 11:01:35 +0100
+-Subject: [PATCH 13/68] Added deblocking pass
+-
+----
+- libavcodec/hevc.c        | 33 +++++++++++++++++++++++++++------
+- libavcodec/hevc.h        |  7 ++++++-
+- libavcodec/hevc_filter.c |  6 +++++-
+- libavcodec/rpi_qpu.c     |  2 +-
+- 4 files changed, 39 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2e269b6..29f8415 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2518,6 +2518,17 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- }
+- 
+- #ifdef RPI
+-+static void rpi_execute_dblk_cmds(HEVCContext *s)
+-+{
+-+    int n;
+-+    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+-+    int (*p)[2] = s->dblk_cmds;
+-+    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
+-+        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+-+    }
+-+    s->num_dblk_cmds = 0;
+-+}
+-+
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-@@ -2631,7 +2642,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+--    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+- #endif
+- 
+-@@ -2665,7 +2675,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-+        if (s->enable_rpi) {
+-+          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-+          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-+          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-             // Transform all blocks
+-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-@@ -2678,10 +2691,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+--            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
+--                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
+--            }
+--            start_ctb_x = 0;
+-+            rpi_execute_dblk_cmds(s);
+-+          }
+-         }
+- #endif
+-         if (more_data < 0) {
+-@@ -2699,6 +2710,16 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+-     }
+- 
+-+#ifdef RPI
+-+    if (s->enable_rpi && s->num_dblk_cmds) {
+-+        rpi_execute_transform(s);
+-+        rpi_execute_inter_cmds(s);
+-+        vpu_wait(s->vpu_id);
+-+        rpi_execute_pred_cmds(s);
+-+        rpi_execute_dblk_cmds(s);
+-+    }
+-+#endif
+-+
+-     if (x_ctb + ctb_size >= s->ps.sps->width &&
+-         y_ctb + ctb_size >= s->ps.sps->height)
+-         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 0d8dfe9..990bd8c 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -808,6 +808,8 @@ typedef struct HEVCLocalContext {
+- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+-+// Worst case is 16x16 CTUs
+-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+- 
+- #define RPI_CMD_LUMA_UNI 0
+- #define RPI_CMD_CHROMA_UNI 1
+-@@ -867,6 +869,9 @@ typedef struct HEVCPredCmd {
+- #endif
+- 
+- typedef struct HEVCContext {
+-+#ifdef RPI
+-+    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
+-+#endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+- 
+-@@ -891,11 +896,11 @@ typedef struct HEVCContext {
+-     GPU_MEM_PTR_T coeffs_buf_accelerated;
+-     int16_t *coeffs_buf_arm[4];
+-     unsigned int coeffs_buf_vc[4];
+--
+-     int num_coeffs[4];
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-     int num_pred_cmds;
+-+    int num_dblk_cmds;
+-     int vpu_id;
+- #endif
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index e4c3da7..ea0af91 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -877,8 +877,12 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             if (s->threads_type & FF_THREAD_FRAME )
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-         }
+--    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+-+        int newh = y + ctb_size - 4;
+-+        //int currh = s->ref->tf.progress->data[0];
+-+        //if (((y + ctb_size)&63)==0)
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+    }
+- }
+- 
+- void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 85f49db..3b6dae7 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -105,7 +105,7 @@ struct GPU
+- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+- 
+--#ifdef RPI_TIME_TOTAL_QPU
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+- static unsigned int Microseconds(void) {
+-     struct timespec ts;
+-     unsigned int x;
+--- 
+-2.7.4
+-
+-
+-From 7a443df9115f21b4428de378bd146dcdba3dd42a Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 16:47:47 +0100
+-Subject: [PATCH 14/68] Added option to disable deblocking for non-ref frames
+-
+----
+- libavcodec/hevc_filter.c | 10 ++++++++++
+- 1 file changed, 10 insertions(+)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index ea0af91..2cdd621 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -25,6 +25,8 @@
+- //#define DISABLE_SAO
+- //#define DISABLE_DEBLOCK
+- //#define DISABLE_STRENGTHS
+-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+-+//#define DISABLE_DEBLOCK_NONREF
+- 
+- #include "libavutil/common.h"
+- #include "libavutil/internal.h"
+-@@ -504,6 +506,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+-+#ifdef DISABLE_DEBLOCK_NONREF
+-+    if (    s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )
+-+      return; // Don't deblock non-reference frames
+-+#endif
+- #ifdef DISABLE_DEBLOCK
+-     return;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 9606e160a582db64ccf981d971cdc258d8cc02f7 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 11 May 2015 10:00:27 +0100
+-Subject: [PATCH 15/68] Moved buffers to VPU memory
+-
+----
+- libavcodec/hevc_filter.c | 17 +++++++++++++-
+- libavcodec/utils.c       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
+- libavutil/buffer.c       |  6 +++++
+- libavutil/buffer.h       |  3 +++
+- 4 files changed, 84 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 2cdd621..e1b32d4 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -866,6 +866,13 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+- #undef CB
+- #undef CR
+- 
+-+#ifdef RPI_INTER_QPU
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+-+#endif
+-+
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-@@ -888,9 +895,17 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-         }
+-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+--        int newh = y + ctb_size - 4;
+-+        //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+-+        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )) {
+-+            flush_buffer(s->frame->buf[1]);
+-+            flush_buffer(s->frame->buf[2]);
+-+        }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+- }
+-diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+-index f7adb52..708526e 100644
+---- a/libavcodec/utils.c
+-+++ b/libavcodec/utils.c
+-@@ -26,6 +26,12 @@
+-  */
+- 
+- #include "config.h"
+-+
+-+#ifdef RPI
+-+// Move video buffers to GPU memory
+-+#define RPI_GPU_BUFFERS
+-+#endif
+-+
+- #include "libavutil/atomic.h"
+- #include "libavutil/attributes.h"
+- #include "libavutil/avassert.h"
+-@@ -64,6 +70,10 @@
+- #include "libavutil/ffversion.h"
+- const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+- 
+-+#ifdef RPI_GPU_BUFFERS
+-+#include "rpi_qpu.h"
+-+#endif
+-+
+- #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
+- static int default_lockmgr_cb(void **arg, enum AVLockOp op)
+- {
+-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+-     return ret;
+- }
+- 
+-+#ifdef RPI_GPU_BUFFERS
+-+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+-+{
+-+    GPU_MEM_PTR_T *p = opaque;
+-+    gpu_free(p);
+-+    av_free(p);
+-+}
+-+
+-+static AVBufferRef *rpi_buffer_alloc(int size)
+-+{
+-+    AVBufferRef *ret = NULL;
+-+    uint8_t    *data = NULL;
+-+    GPU_MEM_PTR_T *p;
+-+
+-+    static int total=0;
+-+    total+=size;
+-+
+-+    p = av_malloc(sizeof *p);
+-+    if (!p)
+-+        return NULL;
+-+
+-+    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
+-+        return NULL;
+-+
+-+    data = p->arm;
+-+    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+-+    //memset(data, 64, size);
+-+
+-+    if (!data)
+-+        return NULL;
+-+
+-+    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+-+    if (!ret) {
+-+        gpu_free(p);
+-+        av_freep(&p);
+-+    }
+-+
+-+    return ret;
+-+}
+-+#endif
+-+
+- static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+- {
+-     FramePool *pool = avctx->internal->pool;
+-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+-             av_buffer_pool_uninit(&pool->pools[i]);
+-             pool->linesize[i] = linesize[i];
+-             if (size[i]) {
+-+#ifdef RPI_GPU_BUFFERS
+-+                if (avctx->codec_id == AV_CODEC_ID_HEVC)
+-+                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+-+                                                     CONFIG_MEMORY_POISONING ?
+-+                                                        NULL :
+-+                                                        rpi_buffer_alloc);
+-+                else
+-+#endif
+-                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+-                                                      CONFIG_MEMORY_POISONING ?
+-                                                         NULL :
+-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
+-index 694e116..203ca7b 100644
+---- a/libavutil/buffer.c
+-+++ b/libavutil/buffer.c
+-@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
+- 
+-     return ret;
+- }
+-+
+-+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+-+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+-+  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+-+  return buf->opaque;
+-+}
+-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
+-index 0c0ce12..82e0bc3 100644
+---- a/libavutil/buffer.h
+-+++ b/libavutil/buffer.h
+-@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
+-  */
+- AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+- 
+-+// Return the opaque for the underlying frame
+-+void *av_buffer_pool_opaque(AVBufferRef *ref);
+-+
+- /**
+-  * @}
+-  */
+--- 
+-2.7.4
+-
+-
+-From f56515b9a720c829ba3ddf6da4232a91b13e0f03 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 11 May 2015 14:04:37 +0100
+-Subject: [PATCH 16/68] Prepared QPU execute code
+-
+----
+- libavcodec/hevc.c        | 227 ++++++++++++++++++++++++++++++++++++++++-------
+- libavcodec/hevc.h        |  22 ++++-
+- libavcodec/hevc_filter.c |   7 +-
+- libavcodec/rpi_qpu.c     |  55 +++++++++++-
+- libavcodec/rpi_qpu.h     |   2 +
+- 5 files changed, 276 insertions(+), 37 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 29f8415..66ed37a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -42,17 +42,45 @@
+- #include "profiles.h"
+- 
+- #ifdef RPI
+--#include "rpi_qpu.h"
+--// For some unknown reason, the code seems to crash if I do a late malloc
+--#define EARLY_MALLOC
+--// Move Inter prediction into separate pass
+--#define RPI_INTER
+-+  #include "rpi_qpu.h"
+-+  // For some unknown reason, the code seems to crash if I do a late malloc
+-+  #define EARLY_MALLOC
+-+  // Move Inter prediction into separate pass
+-+  #define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+- 
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+-+
+-+#ifdef RPI_INTER_QPU
+-+
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+// The QPU code for UV blocks only works up to a block width of 8
+-+#define RPI_CHROMA_BLOCK_WIDTH 8
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
+-+
+-+// TODO Chroma only needs 4 taps
+-+static uint32_t rpi_filter_coefs[8][2] = {
+-+        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
+-+};
+-+
+-+static uint32_t get_vc_address(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+  return p->vc;
+-+}
+-+
+-+#endif
+-+
+- /**
+-  * NOTE: Each function hls_foo correspond to the function foo in the
+-  * specification (HLS stands for High Level Syntax).
+-@@ -66,6 +94,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- static void pic_arrays_free(HEVCContext *s)
+- {
+- #ifdef RPI
+-+
+- #ifdef EARLY_MALLOC
+- #else
+-     printf("pic_arrays_free\n");
+-@@ -1982,6 +2011,43 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int reflist = 0;
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[reflist];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+-@@ -2632,6 +2698,54 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- 
+- #endif
+- 
+-+#ifdef RPI_INTER_QPU
+-+static void rpi_inter_clear(HEVCContext *s)
+-+{
+-+    int i;
+-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = pic_width;
+-+        *s->u_mvs[i]++ = pic_height;
+-+        *s->u_mvs[i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+-+}
+-+
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+
+-+    if (s->sh.slice_type == I_SLICE)
+-+        return;
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+    }
+-+
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+-+}
+-+#endif
+-+
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -2658,6 +2772,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         }
+-     }
+- 
+-+#ifdef RPI_INTER_QPU
+-+    rpi_inter_clear(s);
+-+#endif
+-+
+-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+-         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+- 
+-@@ -2679,19 +2797,30 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--            int x;
+-+#ifdef RPI_INTER_QPU
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+#endif
+-             // Transform all blocks
+-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+-+
+-+            // Copy back reconstructed data
+-+            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
+-+            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
+-+            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
+-+
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+-             rpi_execute_dblk_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+            rpi_inter_clear(s);
+-+#endif
+-           }
+-         }
+- #endif
+-@@ -2712,6 +2841,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+- #ifdef RPI
+-     if (s->enable_rpi && s->num_dblk_cmds) {
+-+#ifdef RPI_INTER_QPU
+-+        rpi_execute_inter_qpu(s);
+-+#endif
+-         rpi_execute_transform(s);
+-         rpi_execute_inter_cmds(s);
+-         vpu_wait(s->vpu_id);
+-@@ -3451,6 +3583,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+- 
+-+#ifdef RPI_INTER_QPU
+-+    if (s->unif_mvs) {
+-+        gpu_free( &s->unif_mvs_ptr );
+-+        s->unif_mvs = 0;
+-+    }
+-+#endif
+-+    //gpu_free(&s->dummy);
+-+
+- #ifdef EARLY_MALLOC
+-     printf("hevc_decode_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-@@ -3541,34 +3681,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+- 
+--    s->coeffs_buf_arm[0] = 0;
+--    s->coeffs_buf_arm[2] = 0;
+-+#ifdef RPI_INTER_QPU
+-+    // We divide the image into blocks 256 wide and 64 high
+-+    // We support up to 2048 widths
+-+    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+-+    // Also add space for the startup command for each stream.
+-+
+-+    {
+-+        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+        uint32_t *p;
+-+        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->unif_mvs;
+-+        for(i = 0; i < 8; i++) {
+-+            s->mvs_base[i] = p;
+-+            p += uv_commands_per_qpu;
+-+        }
+-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+-+
+-+    }
+-+#endif
+-+    //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+- #ifdef EARLY_MALLOC
+--    int coeffs_in_ctb = 64*64;
+--    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+--    printf("Allocated %d\n",coefs_per_row);
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--    if (!s->coeffs_buf_arm[0])
+--        goto fail;
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--    if (!s->coeffs_buf_arm[2])
+--        goto fail;
+--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--    printf("Done\n");
+-+    {
+-+        int coeffs_in_ctb = 64*64;
+-+        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+-+        s->coeffs_buf_arm[0] = 0;
+-+        s->coeffs_buf_arm[2] = 0;
+-+        printf("Allocated %d\n",coefs_per_row);
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+        if (!s->coeffs_buf_arm[0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+        if (!s->coeffs_buf_arm[2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+        printf("Done\n");
+- #ifdef RPI_PRECLEAR
+--    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+--    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+--    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+- #endif
+--
+-+    }
+- #endif
+- 
+-     s->enable_rpi = 0;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 990bd8c..da345f6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -42,7 +42,11 @@
+- 
+- // define RPI to split the CABAC/prediction/transform into separate stages
+- #ifdef RPI
+--#include "rpi_qpu.h"
+-+
+-+  #include "rpi_qpu.h"
+-+  // Use QPU for inter prediction
+-+  //#define RPI_INTER_QPU
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -888,7 +892,7 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;
+-+    HEVCMvCmd *unif_mv_cmds;  // TODO rename
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+-     int buf_width;
+-@@ -902,6 +906,20 @@ typedef struct HEVCContext {
+-     int num_pred_cmds;
+-     int num_dblk_cmds;
+-     int vpu_id;
+-+    //GPU_MEM_PTR_T dummy;
+-+#ifdef RPI_INTER_QPU
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+-+    // Function pointers
+-+    uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b;
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index e1b32d4..5b3d759 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -903,8 +903,11 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--            flush_buffer(s->frame->buf[1]);
+--            flush_buffer(s->frame->buf[2]);
+-+            //flush_buffer(s->frame->buf[1]);
+-+            //flush_buffer(s->frame->buf[2]);
+-+            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-         }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 3b6dae7..e4dd58a 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -3,7 +3,7 @@
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+--//#define RPI_TIME_TOTAL_QPU
+-+#define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+-@@ -30,7 +30,7 @@
+- #endif
+- 
+- // On Pi2 there is no way to access the VPU L2 cache
+--// GPU_MEM_FLG should be 4 for uncached memory.
+-+// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+- #define GPU_MEM_FLG 0xC
+-@@ -549,6 +549,54 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
+-   gpu_unlock();
+- }
+- 
+-+// Run a program on 8 QPUs with the given code and uniform stream (given in GPU addresses)
+-+void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+{
+-+  int i;
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
+-+#endif
+-+
+-+  gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
+-+#endif
+-+  for(i=0;i<8;i++) {
+-+    gpu->mail[i*2 + 1] = code;
+-+  }
+-+  gpu->mail[0 ] = unifs1;
+-+  gpu->mail[2 ] = unifs2;
+-+  gpu->mail[4 ] = unifs3;
+-+  gpu->mail[6 ] = unifs4;
+-+  gpu->mail[8 ] = unifs5;
+-+  gpu->mail[10] = unifs6;
+-+	gpu->mail[12] = unifs7;
+-+	gpu->mail[14] = unifs8;
+-+	execute_qpu(
+-+		gpu->mb,
+-+		8 /* Number of QPUs */,
+-+		gpu->vc + offsetof(struct GPU, mail),
+-+		1 /* no flush */,  // Don't flush VPU L1 cache
+-+		5000 /* timeout ms */);
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  if ((count&0x7f)==0)
+-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-+  gpu_unlock();
+-+}
+-+
+- unsigned int qpu_get_fn(int num) {
+-     // Make sure that the gpu is initialized
+-     unsigned int *fn;
+-@@ -585,6 +633,9 @@ unsigned int qpu_get_fn(int num) {
+-     case QPU_MC_FILTER_UV_B:
+-       fn = mc_filter_uv_b;
+-       break;
+-+    case QPU_MC_INTERRUPT_EXIT8:
+-+      fn = mc_interrupt_exit8;
+-+      break;
+-     case QPU_MC_END:
+-       fn = mc_end;
+-       break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 3526fce..2b22d98 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -16,6 +16,7 @@ extern void gpu_free(GPU_MEM_PTR_T *p);
+- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+- 
+- // QPU specific functions
+-+extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+- 
+- enum {
+-@@ -28,6 +29,7 @@ enum {
+-   QPU_MC_SETUP_UV,
+-   QPU_MC_FILTER_UV,
+-   QPU_MC_FILTER_UV_B,
+-+  QPU_MC_INTERRUPT_EXIT8,
+-   QPU_MC_END
+-   };
+- extern unsigned int qpu_get_fn(int num);
+--- 
+-2.7.4
+-
+-
+-From bd651e1569ebe0cdc41a6be169e139758cce069d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 11:47:23 +0100
+-Subject: [PATCH 17/68] Drafted chroma interpolation on QPUs
+-
+----
+- libavcodec/hevc.c          |   5 ++-
+- libavcodec/hevc.h          |   2 +-
+- libavcodec/hevc_filter.c   |   6 ++-
+- libavcodec/rpi_qpu.c       | 101 +++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.h       |   1 +
+- libavcodec/rpi_shader.c    |  42 +++++++++----------
+- libavcodec/rpi_shader.qasm |  42 +++++++++----------
+- 7 files changed, 149 insertions(+), 50 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 66ed37a..d5ea45e 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -60,11 +60,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+--#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+- static uint32_t rpi_filter_coefs[8][2] = {
+--        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+-@@ -2729,6 +2729,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index da345f6..2497c47 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -45,7 +45,7 @@
+- 
+-   #include "rpi_qpu.h"
+-   // Use QPU for inter prediction
+--  //#define RPI_INTER_QPU
+-+  // #define RPI_INTER_QPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 5b3d759..9b6e26d 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -903,8 +903,10 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--            //flush_buffer(s->frame->buf[1]);
+--            //flush_buffer(s->frame->buf[2]);
+-+#ifdef RPI_INTER_QPU
+-+            flush_buffer(s->frame->buf[1]);
+-+            flush_buffer(s->frame->buf[2]);
+-+#endif
+-             //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-             //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-             //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index e4dd58a..4d9eda8 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -33,7 +33,8 @@
+- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+--#define GPU_MEM_FLG 0xC
+-+#define GPU_MEM_FLG 0x4
+-+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
+- #define GPU_MEM_MAP 0x0
+- 
+- #define vcos_verify(x) ((x)>=0)
+-@@ -165,6 +166,8 @@ static int gpu_init(volatile struct GPU **gpu) {
+- 	ptr->vc_handle = handle;
+- 	ptr->vc = vc;
+- 
+-+  printf("GPU allocated at 0x%x\n",vc);
+-+
+-   *gpu = ptr;
+- 
+-   // Now copy over the QPU code into GPU memory
+-@@ -304,10 +307,13 @@ int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+- 
+- static void gpu_term(void)
+- {
+--	int mb = gpu->mb;
+--	unsigned handle = gpu->vc_handle;
+-+	int mb;
+-+	unsigned handle;
+-+
+-   if (gpu==NULL)
+-     return;
+-+  mb = gpu->mb;
+-+  handle = gpu->vc_handle;
+- 
+- #ifdef RPI_ASYNC
+-   {
+-@@ -648,6 +654,95 @@ unsigned int qpu_get_fn(int num) {
+- }
+- 
+- #if 0
+-+typedef unsigned int uint32_t;
+-+
+-+typedef struct mvs_s {
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+-+
+-+} HEVCContext;
+-+
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+
+-+static void rpi_inter_clear(HEVCContext *s)
+-+{
+-+    int i;
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 128;  // w
+-+        *s->u_mvs[i]++ = 128;  // h
+-+        *s->u_mvs[i]++ = 128;  // stride u
+-+        *s->u_mvs[i]++ = 128;  // stride v
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+-+}
+-+
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+-+    }
+-+
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+-+}
+-+
+-+void rpi_test_qpu(void)
+-+{
+-+    HEVCContext mvs;
+-+    HEVCContext *s = &mvs;
+-+    int i;
+-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+    uint32_t *p;
+-+    printf("Allocate memory\n");
+-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+-+
+-+    // Set up initial locations for uniform streams
+-+    p = s->unif_mvs;
+-+    for(i = 0; i < 8; i++) {
+-+        s->mvs_base[i] = p;
+-+        p += uv_commands_per_qpu;
+-+    }
+-+    // Now run a simple program that should just quit immediately after a single texture fetch
+-+    rpi_inter_clear(s);
+-+    for(i=0;i<4;i++) {
+-+      printf("Launch QPUs\n");
+-+      rpi_execute_inter_qpu(s);
+-+      printf("Done\n");
+-+    }
+-+    printf("Free memory\n");
+-+    gpu_free(&s->unif_mvs_ptr);
+-+    return;
+-+}
+-+#endif
+-+
+-+#if 0
+- 
+- int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+- //int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 2b22d98..f9ad333 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -18,6 +18,7 @@ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+-+extern void rpi_test_qpu(void);
+- 
+- enum {
+-   QPU_MC_SETUP,
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 41cc2e1..d7ed297 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -152,23 +152,23 @@ unsigned int rpi_shader[] = {
+- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- /* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- /* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- /* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- /* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- /* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+- /* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+- /* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+- /* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-@@ -179,20 +179,20 @@ unsigned int rpi_shader[] = {
+- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+- /* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
+- /* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
+- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- /* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+- /* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+- /* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+- /* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 6851e83..02fdcb2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -270,23 +270,23 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+mov r2, rb21         ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+-+add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+-@@ -302,23 +302,23 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+-+asr r0, r0, 14          ; mov r1, ra21
+- min.setf ra15, r0, rb22
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add.ifnn r1, r1, r0     ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- brr.anyn -, r:uvloop
+--asr r1, r1, 15
+-+asr r1, r1, 14
+- min r1, r1, rb22
+- max vpm, r1, 0
+- 
+--- 
+-2.7.4
+-
+-
+-From 61628063461ee5d891af6dbedfd495efcf464012 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 13:54:11 +0100
+-Subject: [PATCH 18/68] Fixed chroma inter prediction
+-
+----
+- libavcodec/hevc.c          |    8 +-
+- libavcodec/hevc.h          |    2 +-
+- libavcodec/rpi_shader.c    | 1170 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   22 +-
+- libavcodec/rpi_shader.qasm |   24 +-
+- 5 files changed, 617 insertions(+), 609 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index d5ea45e..d6d78ee 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -57,9 +57,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- #ifdef RPI_INTER_QPU
+- 
+- #define RPI_CHROMA_COMMAND_WORDS 12
+-+#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+-+
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+-@@ -2024,7 +2026,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                int chan = x0>>8;
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2730,6 +2733,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-+        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-@@ -3689,7 +3693,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     // Also add space for the startup command for each stream.
+- 
+-     {
+--        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 2497c47..d513579 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -45,7 +45,7 @@
+- 
+-   #include "rpi_qpu.h"
+-   // Use QPU for inter prediction
+--  // #define RPI_INTER_QPU
+-+  #define RPI_INTER_QPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index d7ed297..831633b 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -33,7 +33,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+- /* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+- /* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
+- /* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+- /* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-@@ -152,7 +152,7 @@ unsigned int rpi_shader[] = {
+- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
+-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+- /* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- /* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-@@ -178,9 +178,9 @@ unsigned int rpi_shader[] = {
+- /* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
+--/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+- /* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+- /* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+- /* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-@@ -189,400 +189,400 @@ unsigned int rpi_shader[] = {
+- /* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+- /* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+- /* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter
+--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+--/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+-+/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :loop
+--/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // :fast_path
+--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :fast_loop
+--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+--/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+--/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+-+/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+-+/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+-+/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :bloop
+--/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+--/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+-+/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_honly
+--/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+--/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+--/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+-+/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+-+/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :loop_honly
+--/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+--/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+--/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+--/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+--/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+--/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+--/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+-+/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+-+/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+-+/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+-+/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+-+/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+-+/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+- /* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_exit1
+--/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit
+--/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-@@ -592,225 +592,227 @@ unsigned int rpi_shader[] = {
+- /* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit4
+--/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup_uv
+--/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+--/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+--/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
+--/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
+-+/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv_b
+--/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index db971f4..3464cdb 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,16 +5,16 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 146)
+--#define mc_filter (rpi_shader + 360)
+--#define mc_filter_b (rpi_shader + 670)
+--#define mc_filter_honly (rpi_shader + 894)
+--#define mc_exit (rpi_shader + 1048)
+--#define mc_exit1 (rpi_shader + 1066)
+--#define mc_interrupt_exit (rpi_shader + 1082)
+--#define mc_interrupt_exit4 (rpi_shader + 1120)
+--#define mc_interrupt_exit8 (rpi_shader + 1142)
+--#define mc_setup_uv (rpi_shader + 1172)
+--#define mc_filter_uv_b (rpi_shader + 1314)
+--#define mc_end (rpi_shader + 1542)
+-+#define mc_filter (rpi_shader + 364)
+-+#define mc_filter_b (rpi_shader + 674)
+-+#define mc_filter_honly (rpi_shader + 898)
+-+#define mc_exit (rpi_shader + 1052)
+-+#define mc_exit1 (rpi_shader + 1070)
+-+#define mc_interrupt_exit (rpi_shader + 1086)
+-+#define mc_interrupt_exit4 (rpi_shader + 1124)
+-+#define mc_interrupt_exit8 (rpi_shader + 1146)
+-+#define mc_setup_uv (rpi_shader + 1176)
+-+#define mc_filter_uv_b (rpi_shader + 1318)
+-+#define mc_end (rpi_shader + 1546)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 02fdcb2..4809e1d 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -21,7 +21,7 @@
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+--# ra21                                          64
+-+# ra21                                          32
+- # ra22                                          256
+- # ra23                                          8
+- #
+-@@ -97,7 +97,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 64
+-+mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+- 
+-@@ -270,7 +270,7 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r2, r0, ra0
+-+nop                  ; mul24 r2, r0, ra0
+- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-@@ -301,9 +301,9 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 14          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -315,12 +315,14 @@ add r1, r1, r0          ; mul24 r0, ra10, rb10
+- add r1, r1, r0          ; mul24 r0, ra9, rb9
+- add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+--add.ifnn r1, r1, r0     ; mov -, vw_wait
+-+add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:uvloop
+- asr r1, r1, 14
+--min r1, r1, rb22
+--max vpm, r1, 0
+-+add r1, r1, ra21
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 6          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+- 
+- # DMA out for U
+- 
+-@@ -1161,7 +1163,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 64
+-+mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+- 
+--- 
+-2.7.4
+-
+-
+-From b7321192751956ed7deceeb3dabe22ccedb8e08d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 14:37:32 +0100
+-Subject: [PATCH 19/68] Removed unused luma functions
+-
+----
+- libavcodec/hevc.c          |    4 +-
+- libavcodec/rpi_qpu.c       |   32 +-
+- libavcodec/rpi_shader.c    | 1097 +++++++++++++-------------------------------
+- libavcodec/rpi_shader.h    |   19 +-
+- libavcodec/rpi_shader.qasm |  970 +++------------------------------------
+- 5 files changed, 396 insertions(+), 1726 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index d6d78ee..31b8b2f 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2731,8 +2731,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         return;
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-         assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4d9eda8..4e90cc1 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -172,7 +172,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+- 
+-   // Now copy over the QPU code into GPU memory
+-   {
+--    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+-     assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-   }
+-@@ -612,24 +612,24 @@ unsigned int qpu_get_fn(int num) {
+-       gpu_unlock();
+-     }
+-     switch(num) {
+--    case QPU_MC_SETUP:
+--      fn = mc_setup;
+--      break;
+--    case QPU_MC_FILTER:
+--      fn = mc_filter;
+--      break;
+-+    //case QPU_MC_SETUP:
+-+    //  fn = mc_setup;
+-+    //  break;
+-+    //case QPU_MC_FILTER:
+-+    //  fn = mc_filter;
+-+    //  break;
+-     case QPU_MC_EXIT:
+-       fn = mc_exit;
+-       break;
+--    case QPU_MC_INTERRUPT_EXIT:
+--      fn = mc_interrupt_exit;
+--      break;
+--    case QPU_MC_FILTER_B:
+--      fn = mc_filter_b;
+--      break;
+--    case QPU_MC_FILTER_HONLY:
+--      fn = mc_filter_honly;
+--      break;
+-+    //case QPU_MC_INTERRUPT_EXIT:
+-+    //  fn = mc_interrupt_exit;
+-+    //  break;
+-+    //case QPU_MC_FILTER_B:
+-+    //  fn = mc_filter_b;
+-+    //  break;
+-+    //case QPU_MC_FILTER_HONLY:
+-+    //  fn = mc_filter_honly;
+-+    //  break;
+-     case QPU_MC_SETUP_UV:
+-       fn = mc_setup_uv;
+-       break;
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 831633b..170e8ac 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -21,798 +21,331 @@ __declspec(align(8))
+- __attribute__((aligned(8)))
+- #endif
+- unsigned int rpi_shader[] = {
+--// ::mc_setup
+-+// ::mc_setup_uv
+- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+- /* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+- /* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- /* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+- /* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
+--/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
+--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter
+--/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+--/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :loop
+--/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// :fast_path
+--/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :fast_loop
+--/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+--/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+--/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_b
+--/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :bloop
+--/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+--/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_honly
+--/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+--/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+--/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :loop_honly
+--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+--/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+--/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+--/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+--/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+--/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+--/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_exit
+--/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_exit1
+--/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit
+--/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit4
+--/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit8
+--/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_setup_uv
+--/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+--/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+--/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3464cdb..9de4535 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -3,18 +3,11 @@
+- 
+- extern unsigned int rpi_shader[];
+- 
+--#define mc_setup (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 146)
+--#define mc_filter (rpi_shader + 364)
+--#define mc_filter_b (rpi_shader + 674)
+--#define mc_filter_honly (rpi_shader + 898)
+--#define mc_exit (rpi_shader + 1052)
+--#define mc_exit1 (rpi_shader + 1070)
+--#define mc_interrupt_exit (rpi_shader + 1086)
+--#define mc_interrupt_exit4 (rpi_shader + 1124)
+--#define mc_interrupt_exit8 (rpi_shader + 1146)
+--#define mc_setup_uv (rpi_shader + 1176)
+--#define mc_filter_uv_b (rpi_shader + 1318)
+--#define mc_end (rpi_shader + 1546)
+-+#define mc_setup_uv (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 142)
+-+#define mc_filter_uv_b (rpi_shader + 360)
+-+#define mc_exit (rpi_shader + 588)
+-+#define mc_interrupt_exit8 (rpi_shader + 606)
+-+#define mc_end (rpi_shader + 636)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 4809e1d..cd7346d 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -71,8 +71,10 @@
+- 
+- .set rb_const_64,                  rb21
+- 
+--# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
+--::mc_setup
+-+
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+::mc_setup_uv
+- 
+- # Read starting kernel
+- mov ra31, unif
+-@@ -80,7 +82,9 @@ mov ra31, unif
+- # Load first request location
+- add ra_x_base, unif, elem_num # Store x
+- mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame base
+-+mov ra_x2_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+- 
+- # Read image dimensions
+- sub rb25,unif,1
+-@@ -143,29 +147,24 @@ mov r1, vpm_setup(0, 4, h8p(0, 0))
+- add rb28, r0, r1
+- 
+- # Compute base address for first and second access
+--#add r0, unif, elem_num     # x
+- mov r0, ra_x_base           # Load x
+--add r2, r0, 8               # x+8
+- max r0, r0, 0; mov r1, ra_y # Load y
+- min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- add ra_y, r1, 1
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--max r1, r1, 0  # y
+--min r1, r1, rb_frame_height_minus_1
+--add r0, r0, r3; mul24 r1, r1, rb_pitch
+--add r2, r2, r3
+-+add r0, r0, r3
+- and r0, r0, ~3
+--and r2, r2, ~3; mov ra_x_base, r0
+-+max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_x2_base, r2
+- add t0s, r2, r1
+- 
+- # Dump padding words
+- mov r0, unif
+- mov r0, unif
+-+mov r0, unif
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -176,6 +175,8 @@ nop ; mul24 r1, r1, rb_pitch
+- add t0s, r1, ra_x_base
+- add t0s, r1, ra_x2_base
+- 
+-+
+-+
+- ################################################################################
+- 
+- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-@@ -341,453 +342,26 @@ add vw_setup, rb26, r0 # VDW setup 0
+- mov vw_setup, rb29 # Stride
+- mov vw_addr, unif # start the VDW
+- 
+--################################################################################
+--
+--
+--# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--add r0, r0, r3
+--add r2, r2, r3
+--and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+--mov ra_y_next, r1
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--brr.anynn -, r:fast_path
+--asr rb12, r0, rb23  # delay slot 1
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--
+--mov r5rep, -8 # delay slot 2
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r2, r2, r3                                                    ; ldtmu0
+--##
+--## mov r0, ra22
+--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # apply horizontal filter
+--##
+--## asr r2, r2, 15    ; mul24 r3, r0, ra0
+--## min r2, r2, rb22
+--## max ra13, r2, 0
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21
+--## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
+--## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra14, r0, 0
+--##
+--##
+--##
+--##
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra15, r0, 0
+--
+--
+--
+--
+--mov r3, 0
+--
+--:loop
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8 ; mov r1, ra22
+--
+--# apply horizontal filter
+--brr.anyn -, r:loop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:loop
+--asr r1, r1, 15
+--min r1, r1, rb22
+--max vpm, r1, 0
+--
+--# DMA out
+--
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+--
+--####################################################
+--
+--:fast_path
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--## sub r2, r2, r3                                                    ; ldtmu0
+--##
+--## mov r0, ra22
+--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # apply horizontal filter
+--##
+--## asr r2, r2, 15    ; mul24 r3, r0, ra0
+--## min r2, r2, rb22
+--## max ra13, r2, 0
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21
+--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra14, r0, 0
+--##
+--##
+--##
+--##
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21   ; mul24    r3, r0, ra0
+--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra15, r0, 0
+--
+--
+--mov r3, 0  # This signifies the amount of unrolling
+--
+--:fast_loop
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--
+--max r2, ra_y, 0
+--min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--sub r0, r2, r3       ; mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8       ; mov r1, ra22
+--
+--# apply horizontal filter
+--
+--brr.anyn -, r:fast_loop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:fast_loop
+--asr r1, r1, 15
+--min r1, r1, rb22
+--max vpm, r1, 0
+--
+--# DMA out
+--
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+--# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter_b
+-+::mc_filter_uv_b
+- mov ra31, unif
+- 
+- # per-channel shifts were calculated on the *previous* invocation
+- 
+- mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+- 
+- # get base addresses and per-channel shifts for *next* invocation
+- add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+- max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--add r2, r2, r3
+- and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+- mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -801,17 +375,22 @@ and r0, r0, rb22 # Extract height
+- add rb17, r0, 5
+- add rb18, r0, 7
+- shl r0, r0, 7
+-+
+- # r0 is currently height<<7
+- # For vr_setup we want height<<20 (so 20-7=13 additional bits)
+- shl r3, r0, 13
+- shl r3, r3, 8 # Mask off top 8 bits
+- shr r3, r3, 8
+-+
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+-+
+- # In a B frame, so also set up VPM read
+- add vr_setup, r3, rb28
+- 
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -837,9 +416,13 @@ asr rb12, r0, rb23
+- 
+- mov r5rep, -8
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+- mov r3, 0
+- 
+--:bloop
+-+:uvloop_b
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+- 
+-@@ -847,7 +430,7 @@ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+- shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+- mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+-@@ -861,6 +444,7 @@ add t0s, ra_x2_base, r2
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+- sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+- sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-@@ -889,7 +473,7 @@ mov ra13, ra14
+- sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+--brr.anyn -, r:bloop
+-+brr.anyn -, r:uvloop_b
+- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+- asr r0, r0, 15          ; mov r1, ra21
+- min.setf ra15, r0, rb22
+-@@ -906,213 +490,50 @@ sub r1, r1, r0          ; mul24 r0, ra8, rb8
+- sub r1, r1, r0          ; mul24 r0, ra15, rb15
+- sub.ifnn r1, r1, r0     ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15          ; mov -, vr_wait
+-+asr r1, r1, 15
+- min r1, r1, rb22
+- add r0, vpm, 1          # Blend in previous VPM contents at this location
+--brr.anyn -, r:bloop
+-+brr.anyn -, r:uvloop_b
+- max r1, r1, 0
+- add r1, r1, r0
+- shr vpm, r1, 1
+- 
+--# DMA out
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+- 
+- bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+- mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+--# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--# This filter only does horizontal filtering.
+--# It is assumed that the region to fetch does not include extra rows above.
+-+# mc_exit()
+- 
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter_honly
+--mov ra31, unif
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+- 
+--# per-channel shifts were calculated on the *previous* invocation
+-+mov -,srel(0)
+- 
+--mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--add r0, r0, r3
+--add r2, r2, r3
+--and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+--mov ra_y_next, r1
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
+--shl r0, r0, 7 ; mov rb18,r0
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--mov r0, unif
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+--mov r3, 0
+--
+--:loop_honly
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3       ; mov r3, rb31
+--
+--sub.setf -, r3, rb18 ; mov r1, ra22
+--
+--mov -, vw_wait   ; mul24 r0, r0, r1
+--brr.anyn -, r:loop_honly
+--asr r0, r0, 15          # delay 1
+--min r0, r0, rb22        # delay 2
+--max vpm, r0, 0          # delay 3
+--
+--# DMA out
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+--
+--
+--################################################################################
+--
+--# mc_exit()
+--
+--::mc_exit
+--mov  -, vw_wait # wait on the VDW
+--
+--mov -,srel(0)
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+- 
+- nop        ; nop ; thrend
+- nop        ; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+--::mc_exit1
+--mov  -, vw_wait # wait on the VDW
+--
+--#mov -,srel(1)
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+--# mc_interrupt_exit()
+--::mc_interrupt_exit
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--mov -,sacq(0) # 4
+--mov -,sacq(0) # 5
+--mov -,sacq(0) # 6
+--mov -,sacq(0) # 7
+--mov -,sacq(0) # 8
+--mov -,sacq(0) # 9
+--mov -,sacq(0) # 10
+--mov -,sacq(0) # 11
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+--# mc_interrupt_exit4()
+--::mc_interrupt_exit4
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+- # mc_interrupt_exit8()
+- ::mc_interrupt_exit8
+- mov  -, vw_wait # wait on the VDW
+-@@ -1134,282 +555,5 @@ nop        ; nop ; thrend
+- mov interrupt, 1; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+--################################################################################
+--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+--::mc_setup_uv
+--
+--# Read starting kernel
+--mov ra31, unif
+--
+--# Load first request location
+--add ra_x_base, unif, elem_num # Store x
+--mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame u base
+--nop
+--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+--
+--# Read image dimensions
+--sub rb25,unif,1
+--sub rb30,unif,1
+--
+--# get source pitch
+--mov rb16, unif
+--
+--# get destination pitch
+--mov r0, unif
+--mov r1, vdw_setup_1(0)
+--add rb24, r1, r0
+--
+--# load constants
+--
+--mov ra20, 1
+--mov ra21, 32
+--mov ra22, 256
+--mov ra23, 8
+--
+--mov rb20, 0xffffff00
+--mov rb21, 64
+--mov rb22, 255
+--mov rb23, 24
+--
+--# touch vertical context to keep simulator happy
+--
+--mov ra8, 0
+--mov ra9, 0
+--mov ra10, 0
+--mov ra11, 0
+--mov ra12, 0
+--mov ra13, 0
+--mov ra14, 0
+--mov ra15, 0
+--
+--# Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+--
+--# Compute part of VPM to save data into
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))
+--add rb28, r0, r1
+--
+--# Compute base address for first and second access
+--mov r0, ra_x_base           # Load x
+--max r0, r0, 0; mov r1, ra_y # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+--shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--add ra_y, r1, 1
+--add r0, r0, r3
+--and r0, r0, ~3
+--max r1, r1, 0 ; mov ra_x_base, r0 # y
+--min r1, r1, rb_frame_height_minus_1
+--# submit texture requests for first line
+--add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--add t0s, r0, r1 ; mov ra_x2_base, r2
+--add t0s, r2, r1
+--
+--# Dump padding words
+--mov r0, unif
+--mov r0, unif
+--mov r0, unif
+--
+--# submit texture requests for second line
+--max r1, ra_y, 0
+--min r1, r1, rb_frame_height_minus_1
+--add ra_y, ra_y, 1
+--bra -, ra31
+--nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_x_base
+--add t0s, r1, ra_x2_base
+--
+--
+--
+--################################################################################
+--
+--::mc_filter_uv_b
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+--shl ra_xshift_next, r0, 3
+--sub r2, unif, r3 # compute offset from frame base u to frame base v
+--add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+--mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--
+--# r0 is currently height<<7
+--# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+--shl r3, r0, 13
+--shl r3, r3, 8 # Mask off top 8 bits
+--shr r3, r3, 8
+--
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# In a B frame, so also set up VPM read
+--add vr_setup, r3, rb28
+--
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--
+--mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--mov r3, 0
+--
+--:uvloop_b
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8 ; mov r1, ra22
+--
+--# apply horizontal filter
+--brr.anyn -, r:uvloop_b
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15
+--min r1, r1, rb22
+--add r0, vpm, 1          # Blend in previous VPM contents at this location
+--brr.anyn -, r:uvloop_b
+--max r1, r1, 0
+--add r1, r1, r0
+--shr vpm, r1, 1
+--
+--
+--# DMA out for U
+--
+--mov vw_setup, rb26 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+--
+--# DMA out for V
+--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+--# Could potentially push this write into the start of the next pipeline stage.
+--mov r0, 16
+--mov -, vw_wait
+--
+--bra -, ra31
+--add vw_setup, rb26, r0 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+--
+- ::mc_end
+-+# Do not add code here because mc_end must appear after all other code.
+--- 
+-2.7.4
+-
+-
+-From d40d59de0f09fd1a6e7146532418b63d8e2711b7 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 14:54:25 +0100
+-Subject: [PATCH 20/68] Moved chroma P1 to QPUs
+-
+----
+- libavcodec/hevc.c | 38 ++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 38 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 31b8b2f..391d139 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2070,6 +2070,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int reflist = 1;
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[reflist];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                int chan = x0>>8;
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+--- 
+-2.7.4
+-
+-
+-From 75777ba7927086e862104b14f6446e81bc789611 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 15:13:47 +0100
+-Subject: [PATCH 21/68] Added B prediction - not quite right
+-
+----
+- libavcodec/hevc.c          |  58 ++++++++++++++++++++++++
+- libavcodec/rpi_shader.c    | 108 +++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   6 +--
+- libavcodec/rpi_shader.qasm |  48 ++++++++++----------
+- 4 files changed, 141 insertions(+), 79 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 391d139..47ddfff 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2127,6 +2127,64 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                const Mv *mv2         = &current_mv.mv[1];
+-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+-+                intptr_t _mx2         = mx2 << (1 - hshift);
+-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
+-+
+-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx2][0];
+-+                      *u++ = rpi_filter_coefs[_mx2][1];
+-+                      *u++ = rpi_filter_coefs[_my2][0];
+-+                      *u++ = rpi_filter_coefs[_my2][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 170e8ac..5d00cb2 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -265,23 +265,23 @@ unsigned int rpi_shader[] = {
+- /* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- /* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- /* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- /* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- /* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- /* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+- /* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+- /* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+- /* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-@@ -291,61 +291,63 @@ unsigned int rpi_shader[] = {
+- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+- /* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 9de4535..e36c4ae 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,8 +6,8 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 142)
+- #define mc_filter_uv_b (rpi_shader + 360)
+--#define mc_exit (rpi_shader + 588)
+--#define mc_interrupt_exit8 (rpi_shader + 606)
+--#define mc_end (rpi_shader + 636)
+-+#define mc_exit (rpi_shader + 592)
+-+#define mc_interrupt_exit8 (rpi_shader + 610)
+-+#define mc_end (rpi_shader + 640)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index cd7346d..870437d2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -443,23 +443,23 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+-+add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+-@@ -474,23 +474,25 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15
+-+asr r1, r1, 14
+-+add r1, r1, ra21
+-+asr r1, r1, 6
+- min r1, r1, rb22
+- add r0, vpm, 1          # Blend in previous VPM contents at this location
+- brr.anyn -, r:uvloop_b
+--- 
+-2.7.4
+-
+-
+-From 3d4e94b8f0b08fe4c0b582fc7f1dbe9d1d9d60ed Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 08:15:55 +0100
+-Subject: [PATCH 22/68] Added flush for SAO
+-
+----
+- libavcodec/hevc.c        |  2 +-
+- libavcodec/hevc_filter.c | 39 ++++++++++++++++++++++++++-------------
+- 2 files changed, 27 insertions(+), 14 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 47ddfff..93e1eba 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2903,7 +2903,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             rpi_execute_inter_qpu(s);
+- #endif
+-             // Transform all blocks
+--            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 9b6e26d..92a8271 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -871,6 +871,21 @@ static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+- }
+-+
+-+static void ff_hevc_flush_chroma(HEVCContext *s)
+-+{
+-+    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )) {
+-+        flush_buffer(s->frame->buf[1]);
+-+        flush_buffer(s->frame->buf[2]);
+-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-+    }
+-+}
+- #endif
+- 
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-@@ -886,31 +901,29 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x - ctb_size, y);
+-         if (y && x_end) {
+-             sao_filter_CTB(s, x, y - ctb_size);
+--            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_chroma(s);
+-+#endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-+            }
+-         }
+-         if (x_end && y_end) {
+-             sao_filter_CTB(s, x , y);
+--            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_chroma(s);
+-+#endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-+            }
+-         }
+-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+-         //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+--        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )) {
+- #ifdef RPI_INTER_QPU
+--            flush_buffer(s->frame->buf[1]);
+--            flush_buffer(s->frame->buf[2]);
+-+        ff_hevc_flush_chroma(s);
+- #endif
+--            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+--            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+--            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+--        }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+- }
+--- 
+-2.7.4
+-
+-
+-From 3e337b9c4ef0c356a0259be2254ad1bc4d5bbe29 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 09:17:28 +0100
+-Subject: [PATCH 23/68] Stopped using acceleration in unsupported cases
+-
+----
+- libavcodec/hevc.c       | 14 +++++++-------
+- libavcodec/hevc_cabac.c |  4 ++--
+- 2 files changed, 9 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 93e1eba..bfd5a55 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -1152,15 +1152,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+--                        printf("Cross component not supported\n"); // TODO
+--                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+- 
+-             if (lc->tu.cross_pf) {
+--                printf("Cross component not supported\n"); // TODO
+--                exit(-1);
+-                 hls_cross_component_pred(s, 1);
+-             }
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-@@ -1189,8 +1185,6 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+--                        printf("Cross component not supported\n"); // TODO
+--                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+-@@ -2857,7 +2851,13 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+--    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
+-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+-+                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-+                    && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+-+
+- #endif
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 4f072be..38f53de 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1513,9 +1513,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+- #ifdef RPI
+-             if (!use_vpu) {
+-               int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+--              if (max_xy == 0)
+-+              if (max_xy == 0) {
+-                   s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+--              else {
+-+              } else {
+-                   int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+-                   if (max_xy < 4)
+-                       col_limit = FFMIN(4, col_limit);
+--- 
+-2.7.4
+-
+-
+-From 3941d3e4c2305fa037e8aba5a14cf698ac8673db Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 09:42:16 +0100
+-Subject: [PATCH 24/68] Split B prediction into two passes
+-
+----
+- libavcodec/hevc.c          |   1 +
+- libavcodec/hevc.h          |   1 +
+- libavcodec/rpi_qpu.c       |   3 +
+- libavcodec/rpi_qpu.h       |   1 +
+- libavcodec/rpi_shader.c    | 559 +++++++++++++++++++++++++++------------------
+- libavcodec/rpi_shader.h    |  11 +-
+- libavcodec/rpi_shader.qasm | 196 ++++++++++++++--
+- 7 files changed, 531 insertions(+), 241 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index bfd5a55..4b133d2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3801,6 +3801,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-             p += uv_commands_per_qpu;
+-         }
+-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-         s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+- 
+-     }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index d513579..4a39e39 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -917,6 +917,7 @@ typedef struct HEVCContext {
+-     uint32_t *u_mvs[8];
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4e90cc1..60bf079 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -636,6 +636,9 @@ unsigned int qpu_get_fn(int num) {
+-     case QPU_MC_FILTER_UV:
+-       fn = mc_filter_uv;
+-       break;
+-+    case QPU_MC_FILTER_UV_B0:
+-+      fn = mc_filter_uv_b0;
+-+      break;
+-     case QPU_MC_FILTER_UV_B:
+-       fn = mc_filter_uv_b;
+-       break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index f9ad333..543c84b 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -29,6 +29,7 @@ enum {
+-   QPU_MC_FILTER_HONLY,
+-   QPU_MC_SETUP_UV,
+-   QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B0,
+-   QPU_MC_FILTER_UV_B,
+-   QPU_MC_INTERRUPT_EXIT8,
+-   QPU_MC_END
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 5d00cb2..88ad20b 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -39,18 +39,18 @@ unsigned int rpi_shader[] = {
+- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+- /* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+- /* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+- /* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+- /* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-@@ -62,176 +62,176 @@ unsigned int rpi_shader[] = {
+- /* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+- /* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+- /* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_uv_b
+--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_uv_b0
+-+/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+- /* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+- /* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-@@ -253,7 +253,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+- /* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- /* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :uvloop_b
+-+// :uvloop_b0
+- /* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+- /* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+- /* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-@@ -290,7 +290,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+- /* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+- /* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+- /* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-@@ -306,48 +306,163 @@ unsigned int rpi_shader[] = {
+- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- /* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+- /* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_uv_b
+-+/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop_b
+-+/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index e36c4ae..809e582 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,10 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 142)
+--#define mc_filter_uv_b (rpi_shader + 360)
+--#define mc_exit (rpi_shader + 592)
+--#define mc_interrupt_exit8 (rpi_shader + 610)
+--#define mc_end (rpi_shader + 640)
+-+#define mc_filter_uv (rpi_shader + 150)
+-+#define mc_filter_uv_b0 (rpi_shader + 368)
+-+#define mc_filter_uv_b (rpi_shader + 586)
+-+#define mc_exit (rpi_shader + 818)
+-+#define mc_interrupt_exit8 (rpi_shader + 836)
+-+#define mc_end (rpi_shader + 866)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 870437d2..635b894 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -26,7 +26,7 @@
+- # ra23                                          8
+- #
+- # rb20                                          0xffffff00
+--# rb21                                          64
+-+# rb21                                          vpm_setup for writing 16bit results into VPM
+- # rb22                                          255
+- # rb23                                          24
+- #
+-@@ -34,7 +34,7 @@
+- # rb25                                          frame width-1
+- # rb26                                          height<<23 + width<<16 + vdw_setup_0
+- # rb27                                          vdw_setup_0 (depends on QPU number)
+--# rb28                                          vpm_setup (depends on QPU number)
+-+# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
+- # rb29                                          vdw_setup_1(dst_pitch-width)
+- # rb30                                          frame height-1
+- # rb31                                          used as temp to count loop iterations
+-@@ -69,8 +69,6 @@
+- .set ra_y_next,                    ra28
+- .set ra_y,                         ra29
+- 
+--.set rb_const_64,                  rb21
+--
+- 
+- ################################################################################
+- # mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-@@ -106,7 +104,6 @@ mov ra22, 256
+- mov ra23, 8
+- 
+- mov rb20, 0xffffff00
+--mov rb21, 64
+- mov rb22, 255
+- mov rb23, 24
+- 
+-@@ -123,6 +120,7 @@ mov ra15, 0
+- 
+- # Compute part of VPM to use for DMA output
+- mov r2, qpu_num
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+- and r2, r2, 15
+- mov r1, r2
+- asr r1, r1, 2
+-@@ -135,16 +133,21 @@ shl r0, r0, 5
+- add rb27, r0, r1
+- 
+- # Compute part of VPM to save data into
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+mov r2, qpu_num   # qpu_num = abcd
+-+shl r2, r2, 1
+-+and r2, r2, 15    # r2 = bcd0
+-+mov r1, r2        # r1 = bcd0
+-+asr r1, r1, 2     # r1 = bc
+-+shl r1, r1, 6     # r1 = bc000000
+-+mov r0, r2        # r0 = bcd0
+-+and r0, r0, 3     # r0 = d0
+-+add r0, r0, r1    # r0 = bc0000d0
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+- add rb28, r0, r1
+-+asr r0, r0, 1     # r0 = bc0000d
+-+# Prepare VPM command for 16bit intermediates
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r0, r1
+- 
+- # Compute base address for first and second access
+- mov r0, ra_x_base           # Load x
+-@@ -345,6 +348,171 @@ mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-+
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_uv_b0
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+-+# get filter coefficients
+-+
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+-+
+-+# r2 is elem_num
+-+# r3 is loop counter
+-+
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:uvloop_b0
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+-+
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop_b0
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+add r1, r1, ra21
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 6          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+-+
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+################################################################################
+-+
+- ::mc_filter_uv_b
+- mov ra31, unif
+- 
+--- 
+-2.7.4
+-
+-
+-From 85d0ffa2bcf6a2b94c1a0c8f84241cda9ac92ce2 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:04:55 +0100
+-Subject: [PATCH 25/68] Switch to using 16bit temp buffers
+-
+----
+- libavcodec/hevc.c          |  2 +-
+- libavcodec/rpi_shader.c    |  4 ++--
+- libavcodec/rpi_shader.qasm | 10 +++++-----
+- 3 files changed, 8 insertions(+), 8 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4b133d2..28a6660 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2147,7 +2147,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 88ad20b..ffd3a07 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -220,7 +220,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+- /* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+- /* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+- /* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+- /* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-@@ -346,7 +346,7 @@ unsigned int rpi_shader[] = {
+- /* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+- /* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+- /* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 635b894..9577121 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -26,7 +26,7 @@
+- # ra23                                          8
+- #
+- # rb20                                          0xffffff00
+--# rb21                                          vpm_setup for writing 16bit results into VPM
+-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+- # rb22                                          255
+- # rb23                                          24
+- #
+-@@ -370,8 +370,8 @@ and rb_x_base_next, r0, ~3
+- mov ra_y_next, r1
+- add ra_x2_base_next, rb_x_base_next, r2
+- 
+--# set up VPM write
+--mov vw_setup, rb28
+-+# set up VPM write, we need to save 16bit precision
+-+mov vw_setup, rb21
+- 
+- # get width,height of block
+- mov r2, 16
+-@@ -554,8 +554,8 @@ add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--# In a B frame, so also set up VPM read
+--add vr_setup, r3, rb28
+-+# In a B frame, so also set up VPM read (reading back 16bit precision)
+-+add vr_setup, r3, rb21
+- 
+- sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+- 
+--- 
+-2.7.4
+-
+-
+-From abc51bf61df597082fbd7cf1bba5031e4d44318b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:30:44 +0100
+-Subject: [PATCH 26/68] Corrected B prediction: matching md5 sum for hobbit50
+-
+----
+- libavcodec/rpi_shader.c    | 815 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  36 +-
+- 3 files changed, 429 insertions(+), 434 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index ffd3a07..77cca46 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -38,431 +38,428 @@ unsigned int rpi_shader[] = {
+- /* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 809e582..6562fa9 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 150)
+--#define mc_filter_uv_b0 (rpi_shader + 368)
+--#define mc_filter_uv_b (rpi_shader + 586)
+--#define mc_exit (rpi_shader + 818)
+--#define mc_interrupt_exit8 (rpi_shader + 836)
+--#define mc_end (rpi_shader + 866)
+-+#define mc_filter_uv (rpi_shader + 152)
+-+#define mc_filter_uv_b0 (rpi_shader + 370)
+-+#define mc_filter_uv_b (rpi_shader + 584)
+-+#define mc_exit (rpi_shader + 812)
+-+#define mc_interrupt_exit8 (rpi_shader + 830)
+-+#define mc_end (rpi_shader + 860)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 9577121..562dc35 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -39,13 +39,13 @@
+- # rb30                                          frame height-1
+- # rb31                                          used as temp to count loop iterations
+- #
+--# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
+- # ra24                                          clipped(row start address+8+elem_num)&~3
+- # ra25                                          per-channel shifts 2
+- # ra26                                          next ra24
+- # ra27                                          next ra25
+- # ra28                                          next y
+- # ra29                                          y for next texture access
+-+# ra30                                          64
+- #
+- # ra31                                          next kernel address
+- 
+-@@ -102,6 +102,7 @@ mov ra20, 1
+- mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+-+mov ra30, 64
+- 
+- mov rb20, 0xffffff00
+- mov rb22, 255
+-@@ -472,7 +473,7 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b0
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop
+-+asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
+- nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+- 
+- # apply vertical filter and write to VPM
+-@@ -487,18 +488,18 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--add r1, r1, ra21
+--brr.anyn -, r:uvloop
+--asr r1, r1, 6          # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+#asr r1, r1, 14
+-+#add r1, r1, ra21
+-+brr.anyn -, r:uvloop_b0
+-+asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+-+nop                    # Delay 2
+-+nop                    # Delay 3
+- 
+- # DMA out for U
+- 
+- mov vw_setup, rb26 # VDW setup 0
+- mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+-+mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
+- 
+- # DMA out for V
+- # We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-@@ -639,12 +640,11 @@ mov ra12, ra13
+- mov ra13, ra14
+- 
+- sub.setf -, r3, 8 ; mov r1, ra22
+--
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+- asr ra15, r0, 8         ; nop
+--nop                     ; nop
+-+nop                     ; nop    # TODO improve use of delay slots
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -658,15 +658,13 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--add r1, r1, ra21
+--asr r1, r1, 6
+--min r1, r1, rb22
+--add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+asr r1, r1, 14          # shift2=6
+-+add r1, r1, vpm         # Blend in previous VPM contents at this location
+-+add r1, r1, ra30
+- brr.anyn -, r:uvloop_b
+--max r1, r1, 0
+--add r1, r1, r0
+--shr vpm, r1, 1
+-+asr r1, r1, 7           # Delay 1
+-+min r1, r1, rb22        # Delay 2
+-+max vpm, r1, 0          # Delay 3
+- 
+- 
+- # DMA out for U
+--- 
+-2.7.4
+-
+-
+-From ea60373134f98099c4ebaf0d23cca666008b4bba Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:55:07 +0100
+-Subject: [PATCH 27/68] P prediction uses 4 tap filters
+-
+----
+- libavcodec/hevc.c          |  50 ++--
+- libavcodec/rpi_shader.c    | 631 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  43 +--
+- 4 files changed, 344 insertions(+), 390 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 28a6660..a47ebc5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -65,15 +65,15 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+--static uint32_t rpi_filter_coefs[8][2] = {
+--        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
+-+static uint32_t rpi_filter_coefs[8][1] = {
+-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
+-+        { ENCODE_COEFFS(  -2,  58,  10,  -2) },
+-+        { ENCODE_COEFFS(  -4,  54,  16,  -2) },
+-+        { ENCODE_COEFFS(  -6,  46,  28,  -4) },
+-+        { ENCODE_COEFFS(  -4,  36,  36,  -4) },
+-+        { ENCODE_COEFFS(  -4,  28,  46,  -6) },
+-+        { ENCODE_COEFFS(  -2,  16,  54,  -4) },
+-+        { ENCODE_COEFFS(  -2,  10,  58,  -2) }
+- };
+- 
+- static uint32_t get_vc_address(AVBufferRef *bref) {
+-@@ -2027,16 +2027,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2084,16 +2084,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2148,29 +2148,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+--                      *u++ = rpi_filter_coefs[_mx2][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my2][0];
+--                      *u++ = rpi_filter_coefs[_my2][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 77cca46..c8d0728 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -116,8 +116,8 @@ unsigned int rpi_shader[] = {
+- /* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-@@ -128,338 +128,315 @@ unsigned int rpi_shader[] = {
+- /* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- /* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- /* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 6562fa9..1bf7a68 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 370)
+--#define mc_filter_uv_b (rpi_shader + 584)
+--#define mc_exit (rpi_shader + 812)
+--#define mc_interrupt_exit8 (rpi_shader + 830)
+--#define mc_end (rpi_shader + 860)
+-+#define mc_filter_uv_b0 (rpi_shader + 324)
+-+#define mc_filter_uv_b (rpi_shader + 538)
+-+#define mc_exit (rpi_shader + 766)
+-+#define mc_interrupt_exit8 (rpi_shader + 784)
+-+#define mc_end (rpi_shader + 814)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 562dc35..8e4f18f 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -16,8 +16,8 @@
+- # ra19                                          next ra17
+- #
+- # rb16                                          pitch
+--# rb17                                          height + 5
+--# rb18                                          height + 7
+-+# rb17                                          height + 1
+-+# rb18                                          height + 3
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+-@@ -214,8 +214,8 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-@@ -230,18 +230,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -283,26 +276,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+-@@ -312,14 +293,10 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14
+--- 
+-2.7.4
+-
+-
+-From e4bdd110d4640519b751ab428e7976a1e9a15802 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:03:51 +0100
+-Subject: [PATCH 28/68] Optimised B0 pass
+-
+----
+- libavcodec/rpi_shader.c    | 424 +++++++++++++++++++++------------------------
+- libavcodec/rpi_shader.h    |   8 +-
+- libavcodec/rpi_shader.qasm |  43 +----
+- 3 files changed, 212 insertions(+), 263 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index c8d0728..1f63ee0 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -204,239 +204,215 @@ unsigned int rpi_shader[] = {
+- /* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 1bf7a68..cb74887 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 324)
+--#define mc_filter_uv_b (rpi_shader + 538)
+--#define mc_exit (rpi_shader + 766)
+--#define mc_interrupt_exit8 (rpi_shader + 784)
+--#define mc_end (rpi_shader + 814)
+-+#define mc_filter_uv_b (rpi_shader + 490)
+-+#define mc_exit (rpi_shader + 718)
+-+#define mc_interrupt_exit8 (rpi_shader + 736)
+-+#define mc_end (rpi_shader + 766)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 8e4f18f..faa5755 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -357,15 +357,13 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -373,18 +371,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -426,26 +417,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b0
+-@@ -455,18 +434,12 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--#asr r1, r1, 14
+--#add r1, r1, ra21
+- brr.anyn -, r:uvloop_b0
+- asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+- nop                    # Delay 2
+--- 
+-2.7.4
+-
+-
+-From 93805e78a13d36e28ed84a0e8456da2eac45be89 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:12:43 +0100
+-Subject: [PATCH 29/68] Optimised B pass
+-
+----
+- libavcodec/rpi_shader.c    | 202 ++++++++++++++++++++-------------------------
+- libavcodec/rpi_shader.h    |   6 +-
+- libavcodec/rpi_shader.qasm |  41 ++-------
+- 3 files changed, 100 insertions(+), 149 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 1f63ee0..4e6c5ea 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -289,8 +289,8 @@ unsigned int rpi_shader[] = {
+- /* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+- /* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-@@ -299,120 +299,96 @@ unsigned int rpi_shader[] = {
+- /* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+- /* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index cb74887..53da629 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -7,8 +7,8 @@ extern unsigned int rpi_shader[];
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 324)
+- #define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 718)
+--#define mc_interrupt_exit8 (rpi_shader + 736)
+--#define mc_end (rpi_shader + 766)
+-+#define mc_exit (rpi_shader + 670)
+-+#define mc_interrupt_exit8 (rpi_shader + 688)
+-+#define mc_end (rpi_shader + 718)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index faa5755..f38c926 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -491,8 +491,8 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- 
+- # r0 is currently height<<7
+-@@ -508,8 +508,6 @@ add rb26, r0, rb27
+- # In a B frame, so also set up VPM read (reading back 16bit precision)
+- add vr_setup, r3, rb21
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -517,18 +515,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -570,26 +561,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-@@ -598,14 +577,10 @@ nop                     ; nop    # TODO improve use of delay slots
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14          # shift2=6
+--- 
+-2.7.4
+-
+-
+-From e48df43c16de74dddbc7c702d64dd01eaf8e6b39 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:17:09 +0100
+-Subject: [PATCH 30/68] Used P delay slots more efficiently
+-
+----
+- libavcodec/rpi_shader.c    | 437 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  19 +-
+- 3 files changed, 228 insertions(+), 238 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 4e6c5ea..a1af4e3 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -156,239 +156,236 @@ unsigned int rpi_shader[] = {
+- /* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- /* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+- /* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 53da629..1fb3e37 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 324)
+--#define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 670)
+--#define mc_interrupt_exit8 (rpi_shader + 688)
+--#define mc_end (rpi_shader + 718)
+-+#define mc_filter_uv_b0 (rpi_shader + 318)
+-+#define mc_filter_uv_b (rpi_shader + 484)
+-+#define mc_exit (rpi_shader + 664)
+-+#define mc_interrupt_exit8 (rpi_shader + 682)
+-+#define mc_end (rpi_shader + 712)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index f38c926..02e95dd 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -268,6 +268,7 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+-+# apply horizontal filter
+- nop                  ; mul24 r2, r0, ra0
+- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-@@ -276,20 +277,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop
+--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--- 
+-2.7.4
+-
+-
+-From b33dfc243ff5509299685add3c532ab7f207fd73 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:22:25 +0100
+-Subject: [PATCH 31/68] Improved use of delay slots
+-
+----
+- libavcodec/rpi_shader.c    | 503 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  41 ++--
+- 3 files changed, 265 insertions(+), 289 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index a1af4e3..c498f28 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -122,270 +122,263 @@ unsigned int rpi_shader[] = {
+- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit8
+--/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 1fb3e37..3fac45f 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 318)
+--#define mc_filter_uv_b (rpi_shader + 484)
+--#define mc_exit (rpi_shader + 664)
+--#define mc_interrupt_exit8 (rpi_shader + 682)
+--#define mc_end (rpi_shader + 712)
+-+#define mc_filter_uv_b0 (rpi_shader + 316)
+-+#define mc_filter_uv_b (rpi_shader + 476)
+-+#define mc_exit (rpi_shader + 650)
+-+#define mc_interrupt_exit8 (rpi_shader + 668)
+-+#define mc_end (rpi_shader + 698)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 02e95dd..10f5113 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -221,8 +221,6 @@ add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -410,20 +408,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop_b0
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
+--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -432,9 +422,9 @@ nop                     ; mul24 r0, ra13, rb9
+- add r1, r1, r0          ; mul24 r0, ra12, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+sub.setf -, r3, rb18
+- brr.anyn -, r:uvloop_b0
+--asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+-+asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still in 16bit precision
+- nop                    # Delay 2
+- nop                    # Delay 3
+- 
+-@@ -554,19 +544,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop_b
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr ra15, r0, 8         ; nop
+--nop                     ; nop    # TODO improve use of delay slots
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--- 
+-2.7.4
+-
+-
+-From af59f8e00eb977e97debc5e72ba47e0077db1787 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:31:23 +0100
+-Subject: [PATCH 32/68] Avoid writeback of first B results
+-
+----
+- libavcodec/rpi_shader.c    | 229 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |   8 +-
+- libavcodec/rpi_shader.qasm |  18 +---
+- 3 files changed, 121 insertions(+), 134 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index c498f28..ba453a2 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -255,130 +255,125 @@ unsigned int rpi_shader[] = {
+- /* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+- /* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+- /* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3fac45f..45dbe0e 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 316)
+--#define mc_filter_uv_b (rpi_shader + 476)
+--#define mc_exit (rpi_shader + 650)
+--#define mc_interrupt_exit8 (rpi_shader + 668)
+--#define mc_end (rpi_shader + 698)
+-+#define mc_filter_uv_b (rpi_shader + 466)
+-+#define mc_exit (rpi_shader + 640)
+-+#define mc_interrupt_exit8 (rpi_shader + 658)
+-+#define mc_end (rpi_shader + 688)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 10f5113..e138c95 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -428,22 +428,14 @@ asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still
+- nop                    # Delay 2
+- nop                    # Delay 3
+- 
+-+# in pass0 we don't really need to save any results, but need to discard the uniforms
+- # DMA out for U
+- 
+--mov vw_setup, rb26 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
+--
+--# DMA out for V
+--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+--# Could potentially push this write into the start of the next pipeline stage.
+--mov r0, 16
+--mov -, vw_wait
+--
+- bra -, ra31
+--add vw_setup, rb26, r0 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+-+mov r0, unif           # Delay 1
+-+mov r0, unif           # Delay 2
+-+nop                    # Delay 3
+-+
+- 
+- ################################################################################
+- 
+--- 
+-2.7.4
+-
+-
+-From 12e57278cb19a769d2e1488e8e94003027493d09 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:36:24 +0100
+-Subject: [PATCH 33/68] Cutdown size of chroma prediction commands
+-
+----
+- libavcodec/hevc.c          |  17 +-
+- libavcodec/rpi_shader.c    | 543 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  11 +-
+- 4 files changed, 281 insertions(+), 302 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index a47ebc5..32b89d5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -56,7 +56,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+--#define RPI_CHROMA_COMMAND_WORDS 12
+-+#define RPI_CHROMA_COMMAND_WORDS 10
+- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+-@@ -2032,11 +2032,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2091,9 +2088,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2154,11 +2149,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      u+=2; // Intermediate results are not written back in first pass of B filtering
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-@@ -2166,11 +2158,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my2][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2808,7 +2797,7 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+--        s->u_mvs[i] += 3;  // Padding words
+-+        s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index ba453a2..b0b93b5 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -89,291 +89,286 @@ unsigned int rpi_shader[] = {
+- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 45dbe0e..99927c4 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 316)
+--#define mc_filter_uv_b (rpi_shader + 466)
+--#define mc_exit (rpi_shader + 640)
+--#define mc_interrupt_exit8 (rpi_shader + 658)
+--#define mc_end (rpi_shader + 688)
+-+#define mc_filter_uv (rpi_shader + 148)
+-+#define mc_filter_uv_b0 (rpi_shader + 310)
+-+#define mc_filter_uv_b (rpi_shader + 458)
+-+#define mc_exit (rpi_shader + 630)
+-+#define mc_interrupt_exit8 (rpi_shader + 648)
+-+#define mc_end (rpi_shader + 678)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index e138c95..d9ffcda 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -167,8 +167,6 @@ add t0s, r2, r1
+- 
+- # Dump padding words
+- mov r0, unif
+--mov r0, unif
+--mov r0, unif
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -228,11 +226,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -362,11 +359,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -490,11 +486,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+--- 
+-2.7.4
+-
+-
+-From 3e8f02cf9d3e4bfcd07a5fcf321ace07c4f2e6f3 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:21:49 +0100
+-Subject: [PATCH 34/68] hevc: don't redirect when not rpi_enabled
+-
+----
+- libavcodec/hevc.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 32b89d5..2459e34 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -1468,7 +1468,7 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+-  */
+- 
+- #ifdef RPI_INTER
+--#define RPI_REDIRECT(fn) rpi_ ## fn
+-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+- static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+--- 
+-2.7.4
+-
+-
+-From 6da455b382b28c3c1f4e98c1703a695cdb946ad3 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:22:02 +0100
+-Subject: [PATCH 35/68] Use /dev/vcio for mailbox access
+-
+----
+- libavcodec/rpi_mailbox.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-index 536896f..77a56dd 100644
+---- a/libavcodec/rpi_mailbox.c
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -39,7 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- 
+- #define MAJOR_NUM 100
+- #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+--#define DEVICE_FILE_NAME "/dev/char_dev"
+-+#define DEVICE_FILE_NAME "/dev/vcio"
+- 
+- #include "rpi_mailbox.h"
+- 
+--- 
+-2.7.4
+-
+-
+-From f96ef6131f16a4c03b8e2882bdf7319c3b646a6c Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:25:25 +0100
+-Subject: [PATCH 36/68] Use vcsm for all memory allocations
+-
+----
+- libavcodec/rpi_qpu.c | 174 +++++++++++++++++++--------------------------------
+- 1 file changed, 64 insertions(+), 110 deletions(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 60bf079..f62051f 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,7 +1,5 @@
+- #ifdef RPI
+--// define RPI_USE_VCSM to use the vcsm device for shared memory
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+--#define RPI_USE_VCSM
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+- #define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-@@ -25,9 +23,7 @@
+- #include "rpi_shader.h"
+- #include "rpi_hevc_transform.h"
+- 
+--#ifdef RPI_USE_VCSM
+- #include "rpi_user_vcsm.h"
+--#endif
+- 
+- // On Pi2 there is no way to access the VPU L2 cache
+- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+-@@ -96,7 +92,6 @@ struct GPU
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+-   short transMatrix2even[16*16*2];
+-   int open_count; // Number of allocated video buffers
+--  unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-   int      vc; // Address in GPU memory
+-   int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-@@ -105,6 +100,7 @@ struct GPU
+- // Stop more than one thread trying to allocate memory or use the processing resources at once
+- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+-+static GPU_MEM_PTR_T gpu_mem_ptr;
+- 
+- #if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+- static unsigned int Microseconds(void) {
+-@@ -132,39 +128,27 @@ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+- 
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+-+
+- // Connect to QPU, returns 0 on success.
+- static int gpu_init(volatile struct GPU **gpu) {
+-   int mb = mbox_open();
+-   int vc;
+--  int handle;
+-   volatile struct GPU* ptr;
+- 	if (mb < 0)
+- 		return -1;
+- 
+- 	if (qpu_enable(mb, 1)) return -2;
+- 
+--#ifdef RPI_USE_VCSM
+-   vcsm_init();
+--#endif
+-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-+  memset(ptr, 0, sizeof *ptr);
+-+  vc = gpu_mem_ptr.vc;
+- 
+--  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
+--  if (!handle)
+--  {
+--    qpu_enable(mb, 0);
+--    return -3;
+--  }
+--	vc = mem_lock(mb, handle);
+--	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
+--	if (ptr == NULL)
+--	{	mem_free(mb, handle);
+--		mem_unlock(mb, handle);
+--		qpu_enable(mb, 0);
+--		return -4;
+--	}
+--
+--	ptr->mb = mb;
+--	ptr->vc_handle = handle;
+--	ptr->vc = vc;
+-+  ptr->mb = mb;
+-+  ptr->vc = vc;
+- 
+-   printf("GPU allocated at 0x%x\n",vc);
+- 
+-@@ -226,94 +210,74 @@ static void gpu_unlock(void) {
+-   pthread_mutex_unlock(&gpu_mutex);
+- }
+- 
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  assert(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  assert(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  assert(p->arm);
+-+  p->vc = mem_lock(mb, p->vc_handle);
+-+  assert(p->vc);
+-+  return 0;
+-+}
+-+
+- // Allocate memory on GPU
+- // Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+- // Returns 0 on success.
+- // This allocates memory that will not be cached in ARM's data cache.
+- // Therefore safe to use without data cache flushing.
+--int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-   gpu_lock();
+--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+--  p->vcsm_handle = 0;
+--  if (!p->vc_handle)
+--  {
+--    qpu_enable(gpu->mb, 0);
+--    return -3;
+--  }
+--  p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+--  p->numbytes = numbytes;
+--  if (p->arm == NULL)
+--  {
+--    mem_free(gpu->mb, p->vc_handle);
+--    mem_unlock(gpu->mb, p->vc_handle);
+--    gpu_unlock();
+--    qpu_enable(gpu->mb, 0);
+--    return -4;
+--  }
+-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+-   gpu->open_count++;
+-   gpu_unlock();
+--  return 0;
+-+  return r;
+- }
+- 
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--  // This only works when using RPI_USE_VCSM
+-   void *tmp = vcsm_lock(p->vcsm_handle);
+-   vcsm_unlock_ptr(tmp);
+- }
+- 
+-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+-+  assert(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  assert(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  assert(p->arm);
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  assert(p->vc);
+-+  return 0;
+-+}
+-+
+- // This allocates data that will be
+- //    Cached in ARM L2
+- //    Uncached in VPU L2
+--int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-   gpu_lock();
+--#ifdef RPI_USE_VCSM
+--  {
+--      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
+--      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--      p->arm = vcsm_lock(p->vcsm_handle);
+--      p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  }
+--#else
+--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+--  p->vcsm_handle = 0;
+--  if (!p->handle)
+--  {
+--    qpu_enable(gpu->mb, 0);
+--    return -3;
+--  }
+--  p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  printf("This mapmem_private does not seem to work\n");
+--  exit(-1);
+--  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+--  p->numbytes = numbytes;
+--  if (p->arm == NULL)
+--  {
+--    mem_free(gpu->mb, p->handle);
+--    mem_unlock(gpu->mb, p->handle);
+--    gpu_unlock();
+--    qpu_enable(gpu->mb, 0);
+--    return -4;
+--  }
+--#endif
+-+  r = gpu_malloc_cached_internal(numbytes, p);
+-   gpu->open_count++;
+-   gpu_unlock();
+--  return 0;
+-+  return r;
+- }
+- 
+- static void gpu_term(void)
+- {
+--	int mb;
+--	unsigned handle;
+-+  int mb;
+- 
+-   if (gpu==NULL)
+-     return;
+-   mb = gpu->mb;
+--  handle = gpu->vc_handle;
+- 
+- #ifdef RPI_ASYNC
+-   {
+-@@ -323,37 +287,26 @@ static void gpu_term(void)
+-   }
+- #endif
+- 
+-+  qpu_enable(mb, 0);
+-+  gpu_free_internal(&gpu_mem_ptr);
+- 
+--	unmapmem((void*)gpu, sizeof(struct GPU));
+--	mem_unlock(mb, handle);
+--	mem_free(mb, handle);
+--	qpu_enable(mb, 0);
+--#ifdef RPI_USE_VCSM
+-   vcsm_exit();
+--#endif
+--	mbox_close(mb);
+-+
+-+  mbox_close(mb);
+-   gpu = NULL;
+- }
+- 
+--void gpu_free(GPU_MEM_PTR_T *p) {
+-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
+-   int mb = gpu->mb;
+--	unsigned handle = p->vc_handle;
+-+  mem_unlock(mb,p->vc_handle);
+-+  vcsm_unlock_ptr(p->arm);
+-+  vcsm_free(p->vcsm_handle);
+-+}
+-+
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-   gpu_lock();
+--#ifdef RPI_USE_VCSM
+--  if (p->vcsm_handle) {
+--      mem_unlock(mb,p->vc_handle);
+--      vcsm_unlock_ptr(p->arm);
+--      vcsm_free(p->vcsm_handle);
+--  } else {
+--	unmapmem((void*)p->arm, sizeof(struct GPU));
+--      mem_unlock(mb, handle);
+--      mem_free(mb, handle);
+--  }
+--#else
+--	unmapmem((void*)p->arm, sizeof(struct GPU));
+--	mem_unlock(mb, handle);
+--	mem_free(mb, handle);
+--#endif
+-+
+-+  gpu_free_internal(p);
+- 
+-   gpu->open_count--;
+-   if (gpu->open_count==0) {
+-@@ -386,20 +339,21 @@ unsigned int vpu_get_constants(void) {
+- 
+- static void *vpu_start(void *arg) {
+-   while(1) {
+-+    int *p;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+-       pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+--    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    p = vpu_cmds[vpu_async_head%MAXCMDS];
+-     pthread_mutex_unlock(&post_mutex);
+- 
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+-     if (p[7]) {
+--        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-+        //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-     }
+-     vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+--- 
+-2.7.4
+-
+-
+-From 7c94b833b48a455d27d82eb2ca1b53a162705caf Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:43:17 +0100
+-Subject: [PATCH 37/68] Enable EARLY_MALLOC and fix sps access bug
+-
+----
+- libavcodec/hevc.c | 5 +++--
+- 1 file changed, 3 insertions(+), 2 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2459e34..4e82a15 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -44,7 +44,7 @@
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+-   // For some unknown reason, the code seems to crash if I do a late malloc
+--  #define EARLY_MALLOC
+-+  //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- #endif
+-@@ -149,7 +149,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- #ifdef RPI
+- #ifdef EARLY_MALLOC
+- #else
+--    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
+-+    assert(sps);
+-+    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-     printf("pic_arrays_init\n");
+-     printf("Allocated %d\n",coefs_per_row);
+--- 
+-2.7.4
+-
+-
+-From 0a0a92817a7959d213dca9c75a242b6ad88d6b80 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 16:40:51 +0100
+-Subject: [PATCH 38/68] Add copy of av_mod_uintp2 for use with stable ffmpeg
+-
+----
+- libavcodec/hevc.c | 8 ++++++++
+- 1 file changed, 8 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4e82a15..80db603 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -51,6 +51,14 @@
+- 
+- // #define DISABLE_MC
+- 
+-+#ifndef av_mod_uintp2
+-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+-+{
+-+    return a & ((1 << p) - 1);
+-+}
+-+#   define av_mod_uintp2   av_mod_uintp2_c
+-+#endif
+-+
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- 
+--- 
+-2.7.4
+-
+-
+-From c48d08e968b24c2e260b0cc76c7901a1b4d75bbf Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 18 May 2015 11:11:02 +0100
+-Subject: [PATCH 39/68] Added support for weighted prediction in P frames
+-
+----
+- libavcodec/hevc.c          |  52 ++++-
+- libavcodec/rpi_shader.c    | 566 +++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  39 +++-
+- 4 files changed, 384 insertions(+), 285 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 80db603..9668ef8 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -64,7 +64,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+--#define RPI_CHROMA_COMMAND_WORDS 10
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+-@@ -2031,6 +2031,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-                 int chan = x0>>8;
+-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2043,6 +2045,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      if (weight_flag) {
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1] & 0xffff);
+-+                      } else {
+-+                          *u++ = 1; // Weight of 1 and offset of 0
+-+                          *u++ = 1;
+-+                      }
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2085,6 +2094,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-                 int chan = x0>>8;
+-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2098,6 +2109,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      if (weight_flag) {
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
+-+                      } else {
+-+                          *u++ = 1; // Weight of 1 and offset of 0
+-+                          *u++ = 1;
+-+                      }
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2159,6 +2177,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      u+=2; // Weights not supported in B slices
+-                       u+=2; // Intermediate results are not written back in first pass of B filtering
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-@@ -2169,6 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-+                      u+=2; // Weights not supported in B slices
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2795,6 +2815,9 @@ static void rpi_inter_clear(HEVCContext *s)
+-     int i;
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+
+-     for(i=0;i<8;i++) {
+-         s->u_mvs[i] = s->mvs_base[i];
+-         *s->u_mvs[i]++ = 0;
+-@@ -2806,6 +2829,13 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        if (weight_flag) {
+-+            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-+            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+        } else {
+-+            *s->u_mvs[i]++ = 1 << 5;
+-+            *s->u_mvs[i]++ = 6;
+-+        }
+-         s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+-@@ -2849,12 +2879,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+-+#ifdef RPI_INTER_QPU
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+-                     && s->ps.sps->width <= RPI_MAX_WIDTH
+-                     && !s->ps.pps->cross_component_prediction_enabled_flag
+-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+--                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+-+#else
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
+-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+-+#endif
+-+
+-+    /*if (!s->enable_rpi) {
+-+      if (s->ps.pps->cross_component_prediction_enabled_flag)
+-+        printf("Cross component\n");
+-+      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+-+        printf("Tiles\n");
+-+      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-+        printf("Weighted P slice\n");
+-+      if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-+        printf("Weighted B slice\n");
+-+    }*/
+- 
+- #endif
+- 
+-@@ -2987,6 +3034,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+- 
+- #ifdef RPI
+-     s->enable_rpi = 0;
+-+    //printf("Wavefront\n");
+- #endif
+- 
+-     if(ctb_row) {
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index b0b93b5..3f04d80 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -88,287 +88,307 @@ unsigned int rpi_shader[] = {
+- /* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 99927c4..cec9901 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 148)
+--#define mc_filter_uv_b0 (rpi_shader + 310)
+--#define mc_filter_uv_b (rpi_shader + 458)
+--#define mc_exit (rpi_shader + 630)
+--#define mc_interrupt_exit8 (rpi_shader + 648)
+--#define mc_end (rpi_shader + 678)
+-+#define mc_filter_uv (rpi_shader + 152)
+-+#define mc_filter_uv_b0 (rpi_shader + 342)
+-+#define mc_filter_uv_b (rpi_shader + 494)
+-+#define mc_exit (rpi_shader + 670)
+-+#define mc_interrupt_exit8 (rpi_shader + 688)
+-+#define mc_end (rpi_shader + 718)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index d9ffcda..97c4c02 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -9,7 +9,12 @@
+- #                                               (ra15 isn't clamped to zero - this happens during the
+- #                                                copy to ra14, and during its use in the vertical filter)
+- #
+--# rb8...rb15                                    eight vertical filter coefficients
+-+# rb8...rb11                                    eight vertical filter coefficients
+-+
+-+# rb12 offset to add before shift
+-+# rb13 shift
+-+# rb14 weight (U on left, V on right)
+-+# rb15 offset (U on left, V on right)
+- #
+- # ra16                                          clipped(row start address+elem_num)&~3
+- # ra17                                          per-channel shifts
+-@@ -165,6 +170,9 @@ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_x2_base, r2
+- add t0s, r2, r1
+- 
+-+mov rb12,unif # offset before shift
+-+mov rb13,unif # offset after shift
+-+
+- # Dump padding words
+- mov r0, unif
+- 
+-@@ -231,11 +239,21 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+mov r0, unif # U offset/weight
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+mov r0, unif # V offset/weight
+-+asr.ifnz rb15, r0, r2
+-+shl r0, r0, r2
+-+asr.ifnz rb14, r0, r2
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+- mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+-@@ -279,6 +297,11 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+-+mov rb12,32
+-+mov rb13,6
+-+mov rb14,1
+-+mov rb15,0
+-+
+- # apply vertical filter and write to VPM
+- 
+- nop                     ; mul24 r1, ra14, rb10
+-@@ -288,9 +311,11 @@ add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14
+--add r1, r1, ra21
+-+nop                     ; mul24 r1, r1, rb14
+-+add r1, r1, rb12
+-+asr r1, r1, rb13
+- brr.anyn -, r:uvloop
+--asr r1, r1, 6          # Delay 1
+-+add r1, r1, rb15       # Delay 1
+- min r1, r1, rb22       # Delay 2
+- max vpm, r1, 0         # Delay 3
+- 
+-@@ -364,6 +389,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov r0, unif # U offset/weight
+-+mov r0, unif # V offset/weight
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+-@@ -491,6 +519,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov r0, unif # U offset/weight
+-+mov r0, unif # V offset/weight
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+--- 
+-2.7.4
+-
+-
+-From 310d994ea39e29b41a6a013abc4d94e6b90487b2 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 19 May 2015 08:43:30 +0100
+-Subject: [PATCH 40/68] Improved ordering of tasks
+-
+----
+- libavcodec/hevc.c | 8 ++++----
+- 1 file changed, 4 insertions(+), 4 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 9668ef8..951e2d3 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2943,15 +2943,15 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--#ifdef RPI_INTER_QPU
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--#endif
+-             // Transform all blocks
+-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+#endif
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+- 
+--- 
+-2.7.4
+-
+-
+-From d6e1ce7898196e49e52a6223c12979b3d0014588 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 19:58:19 +0100
+-Subject: [PATCH 41/68] Drafted Luma inter prediction
+-
+----
+- libavcodec/rpi_shader.qasm | 594 ++++++++++++++++++++++++++++++++++++++++++---
+- 1 file changed, 554 insertions(+), 40 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 97c4c02..9cfc0d9 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -2,7 +2,10 @@
+- #
+- # ra0...ra7                                     eight horizontal filter coefficients
+- #
+--# rb1...rb7                                     seven shifted copies of the current unfiltered row
+-+# rb0 rx_shift2
+-+# rb1 ra_y2_next
+-+#
+-+# rb4...rb7
+- #
+- # ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
+- #
+-@@ -26,9 +29,9 @@
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+--# ra21                                          32
+-+# ra21                                          ra_21
+- # ra22                                          256
+--# ra23                                          8
+-+# ra23                                          rx_shift2_next
+- #
+- # rb20                                          0xffffff00
+- # rb21                                          vpm_setup for reading/writing 16bit results into VPM
+-@@ -57,16 +60,23 @@
+- .set rb_frame_width_minus_1,       rb25
+- .set rb_frame_height_minus_1,      rb30
+- .set rb_pitch,                     rb16
+--.set ra_x_base,                    ra16
+--.set rb_x_base_next,               rb19
+--.set ra_x2_base,                   ra24
+--.set ra_x2_base_next,              ra26
+-+.set ra_x,                         ra16
+-+.set ra_y2,                        ra21
+-+.set ra_y2_next,                   rb1
+-+
+-+.set rb_x_next,                    rb19
+-+.set rx_frame_base2_next,          rb19
+-+
+-+.set ra_frame_base,                ra24
+-+.set ra_frame_base_next,           ra26
+- .set ra_xshift,                    ra17
+- 
+--.set ra_x2shift,                   ra25
+- .set ra_u2v_ref_offset,            ra25
+-+.set ra_frame_base2,               ra25
+- 
+- .set ra_xshift_next,               ra19
+-+.set rx_xshift2,                   rb0
+-+.set rx_xshift2_next,              ra23
+- 
+- .set ra_x2shift_next,              ra27
+- .set ra_u2v_dst_offset,            ra27
+-@@ -83,11 +93,11 @@
+- mov ra31, unif
+- 
+- # Load first request location
+--add ra_x_base, unif, elem_num # Store x
+-+add ra_x, unif, elem_num # Store x
+- mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame u base
+-+mov ra_frame_base, unif # Store frame u base
+- nop
+--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+- 
+- # Read image dimensions
+- sub rb25,unif,1
+-@@ -104,9 +114,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 32
+- mov ra22, 256
+--mov ra23, 8
+- mov ra30, 64
+- 
+- mov rb20, 0xffffff00
+-@@ -156,18 +164,18 @@ mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which i
+- add rb21, r0, r1
+- 
+- # Compute base address for first and second access
+--mov r0, ra_x_base           # Load x
+-+mov r0, ra_x           # Load x
+- max r0, r0, 0; mov r1, ra_y # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+- shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- add ra_y, r1, 1
+- add r0, r0, r3
+- and r0, r0, ~3
+--max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+max r1, r1, 0 ; mov ra_x, r0 # y
+- min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r0, r1 ; mov ra_frame_base, r2
+- add t0s, r2, r1
+- 
+- mov rb12,unif # offset before shift
+-@@ -182,8 +190,8 @@ min r1, r1, rb_frame_height_minus_1
+- add ra_y, ra_y, 1
+- bra -, ra31
+- nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_x_base
+--add t0s, r1, ra_x2_base
+-+add t0s, r1, ra_x
+-+add t0s, r1, ra_frame_base
+- 
+- 
+- 
+-@@ -192,7 +200,7 @@ add t0s, r1, ra_x2_base
+- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+- ::mc_filter_uv
+- mov ra31, unif
+- 
+-@@ -207,9 +215,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -265,16 +273,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -297,7 +305,7 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+--mov rb12,32
+-+mov rb12,32 # TODO remove these to make P weighted prediction work properly
+- mov rb13,6
+- mov rb14,1
+- mov rb15,0
+-@@ -342,7 +350,7 @@ mov vw_addr, unif # start the VDW
+- # mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+- ::mc_filter_uv_b0
+- mov ra31, unif
+- 
+-@@ -357,9 +365,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write, we need to save 16bit precision
+- mov vw_setup, rb21
+-@@ -408,16 +416,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -477,9 +485,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -538,16 +546,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -642,5 +650,511 @@ nop        ; nop ; thrend
+- mov interrupt, 1; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+-+
+-+
+-+
+-+
+-+# LUMA CODE
+-+
+-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+-+# For P frames we make the second x,y coordinates offset by +8
+-+
+-+################################################################################
+-+# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
+-+::mc_setup
+-+
+-+# Read starting kernel
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov ra31, unif
+-+
+-+# Compute base address for first and second access
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+add ra_y, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+-+max r1, r1, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+-+max r1, r1, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+
+-+
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+-+
+-+# get source pitch
+-+mov rb16, unif
+-+
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+-+
+-+# load constants
+-+
+-+mov ra20, 1
+-+mov ra22, 256
+-+mov ra30, 64
+-+
+-+mov rb20, 0xffffff00
+-+mov rb22, 255
+-+mov rb23, 24
+-+
+-+# touch vertical context to keep simulator happy
+-+
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+-+
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+-+
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num   # qpu_num = abcd
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1
+-+
+-+mov rb12,unif # offset before shift
+-+mov rb13,unif # shift
+-+
+-+# Dump padding words
+-+mov r0, unif
+-+
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_frame_base
+-+
+-+max r1, ra_y2, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+bra -, ra31
+-+add ra_y2, ra_y2, 1           # Delay 1
+-+nop ; mul24 r1, r1, rb_pitch  # Delay 2
+-+add t0s, r1, ra_frame_base2   # Delay 3
+-+
+-+
+-+################################################################################
+-+
+-+# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# In a P block, only the first half of coefficients contain used information.
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+-+::mc_filter
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+mov rx_xshift2, rx_xshift2_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+mov ra_y_next, r1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0   ; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2_next, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+-+
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+# get filter coefficients and discard unused B frame values
+-+mov r0, unif
+-+mov.ifnz -, unif # Alternate coefficients are unused for P frames
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb4, r0, rb23
+-+
+-+mov r0, unif # Frame0 offset/weight
+-+mov.ifnz -, unif # Frame1 offset/weight unused
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+
+-+# r3 is loop counter
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:yloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+# If we knew there was no clipping then this code would get simpler.
+-+# Perhaps we could add on the pitch and clip using larger values?
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, rx_xshift2
+-+mov.ifz ra_y2, ra_y2_next
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y2, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# apply horizontal filter
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 8    ; mov ra12, ra13
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+brr.anyn -, r:yloop
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1, rb14
+-+add r1, r1, rb12
+-+asr r1, r1, rb13
+-+brr.anyn -, r:yloop
+-+add r1, r1, rb15       # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+mov vw_setup, rb29 # Stride         Delay 2
+-+mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+
+-+
+-+################################################################################
+-+
+-+# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# In a P block, only the first half of coefficients contain used information.
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+-+# Can fill in the coefficients so only
+-+# Can also assume default weighted prediction for B frames.
+-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+-+# Or possibly by taking advantage of symmetry?
+-+# From 19->7 32bits per command.
+-+::mc_filter_b
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+mov rx_xshift2, rx_xshift2_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+mov ra_y_next, r1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0   ; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2_next, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+-+
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+# get filter coefficients and discard unused B frame values
+-+mov r0, unif
+-+mov r1, 1
+-+mov.ifnz r0, unif # Alternate coefficients are unused for P frames
+-+nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb4, r0, rb23
+-+
+-+mov r0, unif # Frame0 offset/weight
+-+mov.ifnz r0, unif # Frame1 offset/weight unused
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+
+-+# r3 is loop counter
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:yloopb
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+# If we knew there was no clipping then this code would get simpler.
+-+# Perhaps we could add on the pitch and clip using larger values?
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, rx_xshift2
+-+mov.ifz ra_y2, ra_y2_next
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y2, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# apply horizontal filter
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 8    ; mov ra12, ra13
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+brr.anyn -, r:yloopb
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
+-+add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+add r1, r1, r0
+-+brr.anyn -, r:yloopb
+-+asr r1, r1, 7          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+mov vw_setup, rb29 # Stride         Delay 2
+-+mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+################################################################################
+-+
+-+# mc_interrupt_exit12()
+-+::mc_interrupt_exit12
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+mov -,sacq(0) # 8
+-+mov -,sacq(0) # 9
+-+mov -,sacq(0) # 10
+-+mov -,sacq(0) # 11
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+
+- ::mc_end
+- # Do not add code here because mc_end must appear after all other code.
+--- 
+-2.7.4
+-
+-
+-From f2ffe4186fa49cb27579953c276b51728a08a8b5 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 19:58:30 +0100
+-Subject: [PATCH 42/68] Added support for fast cache flush in deblocker
+-
+----
+- libavcodec/hevc_filter.c   |   44 +-
+- libavcodec/rpi_qpu.c       |    6 +
+- libavcodec/rpi_qpu.h       |    2 +
+- libavcodec/rpi_shader.c    | 1028 +++++++++++++++++++++++++++++---------------
+- libavcodec/rpi_shader.h    |   16 +-
+- libavcodec/rpi_user_vcsm.h |   22 +
+- 6 files changed, 768 insertions(+), 350 deletions(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 92a8271..186317a 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -37,6 +37,11 @@
+- 
+- #include "bit_depth_template.c"
+- 
+-+#ifdef RPI
+-+#include "rpi_user_vcsm.h"
+-+#include "rpi_qpu.h"
+-+#endif
+-+
+- #define LUMA 0
+- #define CB 1
+- #define CR 2
+-@@ -872,15 +877,46 @@ static void flush_buffer(AVBufferRef *bref) {
+-     gpu_cache_flush(p);
+- }
+- 
+--static void ff_hevc_flush_chroma(HEVCContext *s)
+-+// Return Physical address for this image
+-+static int ff_hevc_buf_base(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+  return p->vc & 0x3fffffff;
+-+}
+-+
+-+static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+-+#define RPI_FAST_CACHEFLUSH
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = f->progress->data[0];
+-+        int sz,base;
+-+        if (curr_y < 0) curr_y = 0;
+-+        if (n<=curr_y) return; // Should not happen
+-+        sz = s->frame->linesize[1] * (n-curr_y);
+-+        base = s->frame->linesize[1] * curr_y;
+-+        iocache.s[0].cmd = 3; // Flush L1 cache
+-+        iocache.s[0].addr = 0;
+-+        iocache.s[0].size  = 0;
+-+
+-+        iocache.s[1].cmd = 2;
+-+        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
+-+        iocache.s[1].size  = sz;
+-+
+-+        iocache.s[2].cmd = 2;
+-+        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
+-+        iocache.s[2].size  = sz;
+-+
+-+        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
+-+
+-+#else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-+#endif
+-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-         //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-@@ -903,7 +939,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x, y - ctb_size);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s);
+-+                ff_hevc_flush_chroma(s,&s->ref->tf, y);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-             }
+-@@ -912,7 +948,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x , y);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s);
+-+                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-             }
+-@@ -922,7 +958,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+--        ff_hevc_flush_chroma(s);
+-+        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index f62051f..fd8a276 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -237,6 +237,12 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-   return r;
+- }
+- 
+-+int gpu_get_mailbox(void)
+-+{
+-+  assert(gpu);
+-+  return gpu->mb;
+-+}
+-+
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+-   void *tmp = vcsm_lock(p->vcsm_handle);
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 543c84b..88965e5 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -49,4 +49,6 @@ extern int rpi_test_shader(void);
+- extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+- extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+- 
+-+extern int gpu_get_mailbox(void);
+-+
+- #endif
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 3f04d80..9c30e32 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -23,11 +23,11 @@ __attribute__((aligned(8)))
+- unsigned int rpi_shader[] = {
+- // ::mc_setup_uv
+- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+- /* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+- /* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+- /* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+- /* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-@@ -35,360 +35,708 @@ unsigned int rpi_shader[] = {
+- /* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+- /* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+- /* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000070] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000078] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_setup
+-+/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+-+/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+-+// ::mc_filter
+-+/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :yloop
+-+/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_b
+-+/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
+-+/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+-+/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :yloopb
+-+/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+-+/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_interrupt_exit12
+-+/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index cec9901..3fa8531 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,15 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 342)
+--#define mc_filter_uv_b (rpi_shader + 494)
+--#define mc_exit (rpi_shader + 670)
+--#define mc_interrupt_exit8 (rpi_shader + 688)
+--#define mc_end (rpi_shader + 718)
+-+#define mc_filter_uv (rpi_shader + 148)
+-+#define mc_filter_uv_b0 (rpi_shader + 338)
+-+#define mc_filter_uv_b (rpi_shader + 490)
+-+#define mc_exit (rpi_shader + 666)
+-+#define mc_interrupt_exit8 (rpi_shader + 684)
+-+#define mc_setup (rpi_shader + 714)
+-+#define mc_filter (rpi_shader + 868)
+-+#define mc_filter_b (rpi_shader + 1108)
+-+#define mc_interrupt_exit12 (rpi_shader + 1364)
+-+#define mc_end (rpi_shader + 1402)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-index fbebbbe..95e6de1 100644
+---- a/libavcodec/rpi_user_vcsm.h
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -418,6 +418,28 @@ int vcsm_unlock_hdl( unsigned int handle );
+- */
+- int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+- 
+-+/* Clean and/or invalidate the memory associated with this user opaque handle
+-+**
+-+** Returns:        non-zero on error
+-+**
+-+** structure contains a list of flush/invalidate commands. Commands are:
+-+** 0: nop
+-+** 1: invalidate given physical range in L2
+-+** 2: clean      given physical range in L2
+-+** 3: clean+invalidate all of L1
+-+** 4: flush      all of L2 and all of L1
+-+*/
+-+struct vcsm_user_clean_invalid_s {
+-+    struct {
+-+       unsigned int cmd;
+-+       unsigned int addr;
+-+       unsigned int size;
+-+    } s[8];
+-+};
+-+
+-+int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
+-+
+-+
+- #ifdef __cplusplus
+- }
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 09685ab55aecb9400e354522894e0fbbb6381ca9 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 21:12:55 +0100
+-Subject: [PATCH 43/68] Added multi mailbox - not working
+-
+----
+- libavcodec/hevc.c        | 40 ++++++++++++++++++++++++++++---
+- libavcodec/rpi_mailbox.c | 47 +++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_mailbox.h |  5 ++++
+- libavcodec/rpi_qpu.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++----
+- libavcodec/rpi_qpu.h     |  2 ++
+- 5 files changed, 147 insertions(+), 8 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 951e2d3..ab63efd 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -47,6 +47,11 @@
+-   //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+-+
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-+    #define RPI_MULTI_MAILBOX
+-+  #endif
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -2843,10 +2848,14 @@ static void rpi_inter_clear(HEVCContext *s)
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-+    int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+--
+--    if (s->sh.slice_type == I_SLICE)
+--        return;
+-+    if (s->sh.slice_type == I_SLICE) {
+-+#ifdef RPI_MULTI_MAILBOX
+-+      rpi_execute_transform(s);
+-+      return;
+-+#endif
+-+    }
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-@@ -2856,6 +2865,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+-+#ifdef RPI_MULTI_MAILBOX
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+                                 );
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[i] = 0;
+-+#else
+-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-       (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-       (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-@@ -2866,6 +2891,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-       (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-       );
+-+#endif
+- }
+- #endif
+- 
+-@@ -2945,6 +2971,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-             // Transform all blocks
+-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+#ifdef RPI_MULTI_MAILBOX
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+            // Perform luma inter prediction
+-+            rpi_execute_inter_cmds(s);
+-+#else
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-@@ -2952,6 +2984,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             // Kick off inter prediction on QPUs
+-             rpi_execute_inter_qpu(s);
+- #endif
+-+#endif
+-+
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+- 
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-index 77a56dd..3904efc 100644
+---- a/libavcodec/rpi_mailbox.c
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -276,6 +276,53 @@ unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigne
+-    return p[5];
+- }
+- 
+-+void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30018; // (the tag id)
+-+   p[i++] = 88; // (size of the buffer)
+-+   p[i++] = 88; // (size of the data)
+-+
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = num_qpus_2;
+-+   p[i++] = control_2;
+-+   p[i++] = noflush_2;
+-+   p[i++] = timeout_2; // ms
+-+
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = code_2;
+-+   p[i++] = r0_2;
+-+   p[i++] = r1_2;
+-+   p[i++] = r2_2;
+-+   p[i++] = r3_2;
+-+   p[i++] = r4_2;
+-+   p[i++] = r5_2;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return;
+-+}
+-+
+- int mbox_open() {
+-    int file_desc;
+- 
+-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+-index c264d2e..5898102 100644
+---- a/libavcodec/rpi_mailbox.h
+-+++ b/libavcodec/rpi_mailbox.h
+-@@ -15,6 +15,11 @@ extern void unmapmem(void *addr, unsigned size);
+- 
+- extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+- extern unsigned qpu_enable(int file_desc, unsigned enable);
+- 
+- #endif
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index fd8a276..feb3284 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -123,7 +123,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+--static int vpu_cmds[MAXCMDS][8];
+-+static int vpu_cmds[MAXCMDS][16];
+- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+-@@ -346,6 +346,7 @@ unsigned int vpu_get_constants(void) {
+- static void *vpu_start(void *arg) {
+-   while(1) {
+-     int *p;
+-+    int qpu_code;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -358,12 +359,25 @@ static void *vpu_start(void *arg) {
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+--    if (p[7]) {
+-+    qpu_code = p[7];
+-+    //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+--    }
+--    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+    //}
+-+    if (!qpu_code) {
+-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+    } else {
+-+      int i;
+-+      for(i=0;i<8;i++) {
+-+        gpu->mail[i*2] = p[8+i];
+-+        gpu->mail[i*2 + 1] = qpu_code;
+-+      }
+- 
+-+      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              0, 0, 0, 0,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+    }
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+-@@ -400,7 +414,43 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+-     p[4] = r3;
+-     p[5] = r4;
+-     p[6] = r5;
+--    p[7] = (int) buf;
+-+    p[7] = 0;
+-+    if (num<=1)
+-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-+    pthread_mutex_unlock(&post_mutex);
+-+    return id;
+-+  }
+-+}
+-+
+-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+{
+-+
+-+  pthread_mutex_lock(&post_mutex);
+-+  {
+-+    int id = vpu_async_tail++;
+-+    int *p = vpu_cmds[id%MAXCMDS];
+-+    int num = vpu_async_tail - vpu_async_head;
+-+    if (num>MAXCMDS) {
+-+      printf("Too many commands submitted\n");
+-+      exit(-1);
+-+    }
+-+    p[0] = vpu_code;
+-+    p[1] = r0;
+-+    p[2] = r1;
+-+    p[3] = r2;
+-+    p[4] = r3;
+-+    p[5] = r4;
+-+    p[6] = r5;
+-+    p[7] = qpu_code;
+-+    p[8 ] = unifs1;
+-+    p[9 ] = unifs2;
+-+    p[10] = unifs3;
+-+    p[11] = unifs4;
+-+    p[12] = unifs5;
+-+    p[13] = unifs6;
+-+    p[14] = unifs7;
+-+    p[15] = unifs8;
+-     if (num<=1)
+-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -966,6 +1016,7 @@ void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, i
+- }
+- 
+- 
+-+
+- #endif
+- 
+- #endif // RPI
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 88965e5..2f08f03 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -41,6 +41,8 @@ extern unsigned int vpu_get_fn(void);
+- extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+--- 
+-2.7.4
+-
+-
+-From 311f2da06d13a98d9bdda2df8684d7cf55b9a08e Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 21 May 2015 16:50:02 +0100
+-Subject: [PATCH 44/68] Pass qpu number in as uniform
+-
+----
+- libavcodec/hevc.c          |    2 +-
+- libavcodec/rpi_shader.c    | 1288 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   20 +-
+- libavcodec/rpi_shader.qasm |   10 +-
+- 4 files changed, 657 insertions(+), 663 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ab63efd..caadfaa 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2834,6 +2834,7 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        *s->u_mvs[i]++ = i;
+-         if (weight_flag) {
+-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-@@ -2841,7 +2842,6 @@ static void rpi_inter_clear(HEVCContext *s)
+-             *s->u_mvs[i]++ = 1 << 5;
+-             *s->u_mvs[i]++ = 6;
+-         }
+--        s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 9c30e32..a0f0282 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -48,8 +48,8 @@ unsigned int rpi_shader[] = {
+- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
+-+/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+- /* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+- /* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+- /* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-@@ -60,669 +60,669 @@ unsigned int rpi_shader[] = {
+- /* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+- /* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+- /* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+--/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+--/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+--/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+-+/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup
+--/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+--/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+--/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+--/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+--/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+--/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+--/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+-+/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+-+/* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+- // ::mc_filter
+--/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloop
+--/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
+--/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+--/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+--/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+--/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+--/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+--/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+--/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+--/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
+-+/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+-+/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloopb
+--/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+--/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+--/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+-+/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_interrupt_exit12
+--/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-@@ -732,11 +732,9 @@ unsigned int rpi_shader[] = {
+- /* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3fa8531..6e552d9 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,15 +4,15 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 148)
+--#define mc_filter_uv_b0 (rpi_shader + 338)
+--#define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 666)
+--#define mc_interrupt_exit8 (rpi_shader + 684)
+--#define mc_setup (rpi_shader + 714)
+--#define mc_filter (rpi_shader + 868)
+--#define mc_filter_b (rpi_shader + 1108)
+--#define mc_interrupt_exit12 (rpi_shader + 1364)
+--#define mc_end (rpi_shader + 1402)
+-+#define mc_filter_uv (rpi_shader + 144)
+-+#define mc_filter_uv_b0 (rpi_shader + 334)
+-+#define mc_filter_uv_b (rpi_shader + 486)
+-+#define mc_exit (rpi_shader + 662)
+-+#define mc_interrupt_exit8 (rpi_shader + 680)
+-+#define mc_setup (rpi_shader + 710)
+-+#define mc_filter (rpi_shader + 864)
+-+#define mc_filter_b (rpi_shader + 1104)
+-+#define mc_interrupt_exit12 (rpi_shader + 1360)
+-+#define mc_end (rpi_shader + 1398)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 9cfc0d9..a0b8e5a 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -133,8 +133,8 @@ mov ra14, 0
+- mov ra15, 0
+- 
+- # Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+mov r3, unif
+-+shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+- and r2, r2, 15
+- mov r1, r2
+- asr r1, r1, 2
+-@@ -147,8 +147,7 @@ shl r0, r0, 5
+- add rb27, r0, r1
+- 
+- # Compute part of VPM to save data into
+--mov r2, qpu_num   # qpu_num = abcd
+--shl r2, r2, 1
+-+shl r2, r3, 1
+- and r2, r2, 15    # r2 = bcd0
+- mov r1, r2        # r1 = bcd0
+- asr r1, r1, 2     # r1 = bc
+-@@ -181,9 +180,6 @@ add t0s, r2, r1
+- mov rb12,unif # offset before shift
+- mov rb13,unif # offset after shift
+- 
+--# Dump padding words
+--mov r0, unif
+--
+- # submit texture requests for second line
+- max r1, ra_y, 0
+- min r1, r1, rb_frame_height_minus_1
+--- 
+-2.7.4
+-
+-
+-From db6fe49d50e42c444b5833acc6206c0bbfaacef4 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Sat, 23 May 2015 13:20:21 +0100
+-Subject: [PATCH 45/68] Add new cache flushing routine
+-
+----
+- libavcodec/hevc.c          |  8 +++--
+- libavcodec/hevc_filter.c   | 39 ++++++++++-----------
+- libavcodec/rpi_qpu.c       | 17 +++++++--
+- libavcodec/rpi_qpu.h       |  2 ++
+- libavcodec/rpi_user_vcsm.h | 86 ++++++++++++++++++++++++++--------------------
+- 5 files changed, 91 insertions(+), 61 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index caadfaa..9d12583 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3575,9 +3575,13 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+-     }
+- 
+- fail:
+--    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+-+    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+-+#ifdef RPI_INTER_QPU
+-+        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+-+        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
+-+#endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+--
+-+    }
+-     return ret;
+- }
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 186317a..ec84e8a 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -883,36 +883,35 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+-   return p->vc & 0x3fffffff;
+- }
+- 
+--static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--#define RPI_FAST_CACHEFLUSH
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+--        int curr_y = f->progress->data[0];
+-+        int curr_y = ((int *)f->progress->data)[0];
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        if (curr_y < 0) curr_y = 0;
+--        if (n<=curr_y) return; // Should not happen
+--        sz = s->frame->linesize[1] * (n-curr_y);
+--        base = s->frame->linesize[1] * curr_y;
+--        iocache.s[0].cmd = 3; // Flush L1 cache
+--        iocache.s[0].addr = 0;
+--        iocache.s[0].size  = 0;
+--
+--        iocache.s[1].cmd = 2;
+--        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+        iocache.s[0].handle = p->vcsm_handle;
+-+        iocache.s[0].cmd = 3; // clean+invalidate
+-+        iocache.s[0].addr = p->arm + base;
+-+        iocache.s[0].size  = sz;
+-+        p = av_buffer_pool_opaque(s->frame->buf[2]);
+-+        iocache.s[1].handle = p->vcsm_handle;
+-+        iocache.s[1].cmd = 3; // clean+invalidate
+-+        iocache.s[1].addr = p->arm + base;
+-         iocache.s[1].size  = sz;
+--
+--        iocache.s[2].cmd = 2;
+--        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
+--        iocache.s[2].size  = sz;
+--
+--        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
+--
+-+        vcsm_clean_invalid( &iocache );
+- #else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index feb3284..aa65a77 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -211,6 +211,7 @@ static void gpu_unlock(void) {
+- }
+- 
+- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-+  p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-   assert(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-@@ -243,13 +244,25 @@ int gpu_get_mailbox(void)
+-   return gpu->mb;
+- }
+- 
+-+// Call this to clean and invalidate a region of memory
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--  void *tmp = vcsm_lock(p->vcsm_handle);
+--  vcsm_unlock_ptr(tmp);
+-+#define RPI_FAST_CACHEFLUSH
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].size  = p->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp = vcsm_lock(p->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+- }
+- 
+- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 2f08f03..0565a60 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -1,6 +1,8 @@
+- #ifndef RPI_QPU_H
+- #define RPI_QPU_H
+- 
+-+#define RPI_FAST_CACHEFLUSH
+-+
+- typedef struct gpu_mem_ptr_s {
+-   unsigned char *arm; // Pointer to memory mapped on ARM side
+-   int vc_handle;   // Videocore handle of relocatable memory
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-index 95e6de1..db41a4d 100644
+---- a/libavcodec/rpi_user_vcsm.h
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -1,29 +1,41 @@
+--/*
+--Copyright (c) 2012, Broadcom Europe Ltd
+--All rights reserved.
+--
+--Redistribution and use in source and binary forms, with or without
+--modification, are permitted provided that the following conditions are met:
+--    * Redistributions of source code must retain the above copyright
+--      notice, this list of conditions and the following disclaimer.
+--    * Redistributions in binary form must reproduce the above copyright
+--      notice, this list of conditions and the following disclaimer in the
+--      documentation and/or other materials provided with the distribution.
+--    * Neither the name of the copyright holder nor the
+--      names of its contributors may be used to endorse or promote products
+--      derived from this software without specific prior written permission.
+--
+--THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+--ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+--WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+--DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+--DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+--(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+--LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+--ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+--(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+--SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+--*/
+-+/*****************************************************************************
+-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+-+*
+-+* This program is the proprietary software of Broadcom Corporation and/or
+-+* its licensors, and may only be used, duplicated, modified or distributed
+-+* pursuant to the terms and conditions of a separate, written license
+-+* agreement executed between you and Broadcom (an "Authorized License").
+-+* Except as set forth in an Authorized License, Broadcom grants no license
+-+* (express or implied), right to use, or waiver of any kind with respect to
+-+* the Software, and Broadcom expressly reserves all rights in and to the
+-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
+-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+-+* THE SOFTWARE.
+-+*
+-+* Except as expressly set forth in the Authorized License,
+-+* 1. This program, including its structure, sequence and organization,
+-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
+-+*    all reasonable efforts to protect the confidentiality thereof, and to
+-+*    use this information only in connection with your use of Broadcom
+-+*    integrated circuit products.
+-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+-+*****************************************************************************/
+- 
+- #ifndef __USER_VCSM__H__INCLUDED__
+- #define __USER_VCSM__H__INCLUDED__
+-@@ -424,21 +436,21 @@ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+- **
+- ** structure contains a list of flush/invalidate commands. Commands are:
+- ** 0: nop
+--** 1: invalidate given physical range in L2
+--** 2: clean      given physical range in L2
+--** 3: clean+invalidate all of L1
+--** 4: flush      all of L2 and all of L1
+-+** 1: invalidate       given virtual range in L1/L2
+-+** 2: clean            given virtual range in L1/L2
+-+** 3: clean+invalidate given virtual range in L1/L2
+-+** 4: flush all L1/L2
+- */
+- struct vcsm_user_clean_invalid_s {
+--    struct {
+--       unsigned int cmd;
+--       unsigned int addr;
+--       unsigned int size;
+--    } s[8];
+-+   struct {
+-+      unsigned int cmd;
+-+      unsigned int handle;
+-+      unsigned int addr;
+-+      unsigned int size;
+-+   } s[8];
+- };
+- 
+--int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
+--
+-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+- 
+- #ifdef __cplusplus
+- }
+--- 
+-2.7.4
+-
+-
+-From 87a6cb3a4f7189e711c85de6d20077b6453b2ebe Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Sat, 23 May 2015 21:10:10 +0100
+-Subject: [PATCH 46/68] Fix multi mailbox extra transform call
+-
+----
+- libavcodec/hevc.c | 2 ++
+- 1 file changed, 2 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 9d12583..30f5834 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3024,7 +3024,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_INTER_QPU
+-         rpi_execute_inter_qpu(s);
+- #endif
+-+#ifndef RPI_MULTI_MAILBOX
+-         rpi_execute_transform(s);
+-+#endif
+-         rpi_execute_inter_cmds(s);
+-         vpu_wait(s->vpu_id);
+-         rpi_execute_pred_cmds(s);
+--- 
+-2.7.4
+-
+-
+-From 2a3672a1bda0296453953bebe8b17d69445260b4 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 27 May 2015 16:44:29 +0100
+-Subject: [PATCH 47/68] Added support for running luma prediction on QPUs
+-
+----
+- libavcodec/hevc.c          |  237 +++++++-
+- libavcodec/hevc.h          |   26 +-
+- libavcodec/hevc_filter.c   |   23 +-
+- libavcodec/rpi_qpu.c       |  156 ++++--
+- libavcodec/rpi_qpu.h       |    8 +-
+- libavcodec/rpi_shader.c    | 1313 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   21 +-
+- libavcodec/rpi_shader.qasm |  883 ++++++++++++++---------------
+- 8 files changed, 1464 insertions(+), 1203 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 30f5834..2da88ec 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -52,6 +52,11 @@
+-     // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-     #define RPI_MULTI_MAILBOX
+-   #endif
+-+
+-+  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+-+
+-+
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -74,6 +79,13 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+-+// Split image of 2048 into parts 64 wide
+-+// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+-+// Each block of 64*64
+-+// Smallest CTU size is 16x16, so smallest block is 8x8
+-+// Corresponds to a total of 83kbytes over all 12 QPUs
+-+#define RPI_LUMA_COMMAND_WORDS 9
+-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -2015,10 +2027,46 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            int reflist = 0;
+-+            const Mv *mv         = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  if (weight_flag) {
+-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-+                  } else {
+-+                      *y++ = 1; // Weight of 1 and offset of 0
+-+                  }
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2078,10 +2126,47 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            int reflist = 1;
+-+            const Mv *mv    = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  if (weight_flag) {
+-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-+                  } else {
+-+                      *y++ = 1; // Weight of 1 and offset of 0
+-+                  }
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+
+-+        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2115,8 +2200,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       if (weight_flag) {
+--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
+--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][1] & 0xffff);
+-                       } else {
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-@@ -2143,9 +2228,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            const Mv *mv    = &current_mv.mv[0];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            const Mv *mv2    = &current_mv.mv[1];
+-+            int mx2          = mv2->x & 3;
+-+            int my2          = mv2->y & 3;
+-+            int my2_mx2 = (my2<<8) + mx2;
+-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int x2 = x0 + (mv2->x >> 2);
+-+            int y2 = y0 + (mv2->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = 1; // B frame weighted prediction not supported
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+        {
+-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2834,7 +2954,6 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+--        *s->u_mvs[i]++ = i;
+-         if (weight_flag) {
+-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-@@ -2842,7 +2961,31 @@ static void rpi_inter_clear(HEVCContext *s)
+-             *s->u_mvs[i]++ = 1 << 5;
+-             *s->u_mvs[i]++ = 6;
+-         }
+-+        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+    }
+-+
+-+#ifdef RPI_LUMA_QPU
+-+    for(i=0;i<12;i++) {
+-+        s->y_mvs[i] = s->y_mvs_base[i];
+-+        *s->y_mvs[i]++ = 0; // y_x
+-+        *s->y_mvs[i]++ = 0; // ref_y_base
+-+        *s->y_mvs[i]++ = 0; // y2_x2
+-+        *s->y_mvs[i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
+-+        if (weight_flag) {
+-+            int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
+-+            int shift = s->sh.luma_log2_weight_denom + 6;
+-+            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+        } else {
+-+            int offset = 1 << 5;
+-+            int shift = 6;
+-+            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+        }
+-+        *s->y_mvs[i]++ = 0; // Next kernel
+-     }
+-+#endif
+- }
+- 
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+-@@ -2850,6 +2993,9 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-     int k;
+-     int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+#ifdef RPI_LUMA_QPU
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
+-+#endif
+-     if (s->sh.slice_type == I_SLICE) {
+- #ifdef RPI_MULTI_MAILBOX
+-       rpi_execute_transform(s);
+-@@ -2865,8 +3011,23 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+-+#ifdef RPI_LUMA_QPU
+-+    for(k=0;k<12;k++) {
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+    }
+-+    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+#endif
+-+
+-+
+- #ifdef RPI_MULTI_MAILBOX
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
+-+#else
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+#endif
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-@@ -2876,7 +3037,27 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-                                    (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+#ifdef RPI_LUMA_QPU
+-+                                   qpu_get_fn(QPU_MC_SETUP),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
+-+#else
+-+                                   0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0
+-+#endif
+-                                  );
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2892,6 +3073,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-       );
+- #endif
+-+
+-+
+- }
+- #endif
+- 
+-@@ -3579,8 +3762,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+- fail:
+-     if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+- #ifdef RPI_INTER_QPU
+--        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+--        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-     }
+-@@ -3767,7 +3949,6 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- 
+- #ifdef RPI
+-     av_freep(&s->unif_mv_cmds);
+--    av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+- 
+- #ifdef RPI_INTER_QPU
+-@@ -3776,7 +3957,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-         s->unif_mvs = 0;
+-     }
+- #endif
+--    //gpu_free(&s->dummy);
+-+#ifdef RPI_LUMA_QPU
+-+    if (s->y_unif_mvs) {
+-+        gpu_free( &s->y_unif_mvs_ptr );
+-+        s->y_unif_mvs = 0;
+-+    }
+-+#endif
+- 
+- #ifdef EARLY_MALLOC
+-     printf("hevc_decode_free\n");
+-@@ -3861,9 +4047,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-     if (!s->unif_mv_cmds)
+-         goto fail;
+--    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
+--    if (!s->unif_xfm_cmds)
+--        goto fail;
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+-@@ -3877,7 +4060,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     {
+-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+#else
+-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+#endif
+-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+- 
+-         // Set up initial locations for uniform streams
+-@@ -3892,6 +4079,28 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+- 
+-     }
+- #endif
+-+#ifdef RPI_LUMA_QPU
+-+    {
+-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+#else
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+#endif
+-+        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->y_unif_mvs;
+-+        for(i = 0; i < 12; i++) {
+-+            s->y_mvs_base[i] = p;
+-+            p += y_commands_per_qpu;
+-+        }
+-+        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+-+
+-+    }
+-+#endif
+-     //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+- #ifdef EARLY_MALLOC
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 4a39e39..5df9dcd 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -44,9 +44,13 @@
+- #ifdef RPI
+- 
+-   #include "rpi_qpu.h"
+--  // Use QPU for inter prediction
+-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+-   #define RPI_INTER_QPU
+- 
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-+    #define RPI_LUMA_QPU
+-+  #endif
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -809,7 +813,6 @@ typedef struct HEVCLocalContext {
+- 
+- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+--#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+- // Worst case is 16x16 CTUs
+-@@ -844,9 +847,6 @@ typedef struct HEVCMvCmd {
+-     int8_t ref_idx[2];
+- } HEVCMvCmd;
+- 
+--// Command for transform to process a block of coefficients
+--typedef struct HEVCXfmCmd {
+--} HEVCXfmCmd;
+- 
+- // Command for intra prediction and transform_add of predictions to coefficients
+- #define RPI_PRED_TRANSFORM_ADD 0
+-@@ -892,8 +892,7 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;  // TODO rename
+--    HEVCXfmCmd *unif_xfm_cmds;
+-+    HEVCMvCmd *unif_mv_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+-     int buf_width;
+-     GPU_MEM_PTR_T coeffs_buf_default;
+-@@ -920,6 +919,15 @@ typedef struct HEVCContext {
+-     uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+-+#ifdef RPI_LUMA_QPU
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr;
+-+    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[12];
+-+    uint32_t *y_mvs[12];
+-+    // Function pointers
+-+    uint32_t mc_filter;
+-+    uint32_t mc_filter_b;
+-+#endif
+- 
+- #endif
+- 
+-@@ -1166,6 +1174,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                                  int log2_trafo_size, enum ScanType scan_idx,
+-                                  int c_idx);
+- 
+-+#ifdef RPI_INTER_QPU
+-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+-+#endif
+-+
+- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+- 
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index ec84e8a..11629e4 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -883,8 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+-   return p->vc & 0x3fffffff;
+- }
+- 
+--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-@@ -911,10 +910,24 @@ void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-         iocache.s[1].cmd = 3; // clean+invalidate
+-         iocache.s[1].addr = p->arm + base;
+-         iocache.s[1].size  = sz;
+-+
+-+#ifdef RPI_LUMA_QPU
+-+        p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+        sz = s->frame->linesize[0] * (n-curr_y);
+-+        base = s->frame->linesize[0] * curr_y;
+-+        iocache.s[2].handle = p->vcsm_handle;
+-+        iocache.s[2].cmd = 3; // clean+invalidate
+-+        iocache.s[2].addr = p->arm + base;
+-+        iocache.s[2].size  = sz;
+-+#endif
+-         vcsm_clean_invalid( &iocache );
+- #else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-+#ifdef RPI_LUMA_QPU
+-+        flush_buffer(s->frame->buf[1]);
+-+#endif
+-+
+- #endif
+-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-@@ -938,7 +951,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x, y - ctb_size);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s,&s->ref->tf, y);
+-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-             }
+-@@ -947,7 +960,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x , y);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
+-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-             }
+-@@ -957,7 +970,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+--        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index aa65a77..e12304b 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,9 +1,11 @@
+- #ifdef RPI
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+--#define RPI_TIME_TOTAL_QPU
+-+//#define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+-+//#define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- 
+-@@ -94,7 +96,8 @@ struct GPU
+-   int open_count; // Number of allocated video buffers
+-   int      mb; // Mailbox handle
+-   int      vc; // Address in GPU memory
+--  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+- };
+- 
+- // Stop more than one thread trying to allocate memory or use the processing resources at once
+-@@ -102,7 +105,7 @@ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+- static GPU_MEM_PTR_T gpu_mem_ptr;
+- 
+--#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+- static unsigned int Microseconds(void) {
+-     struct timespec ts;
+-     unsigned int x;
+-@@ -123,7 +126,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+--static int vpu_cmds[MAXCMDS][16];
+-+static int vpu_cmds[MAXCMDS][32];
+- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+-@@ -247,7 +250,6 @@ int gpu_get_mailbox(void)
+- // Call this to clean and invalidate a region of memory
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--#define RPI_FAST_CACHEFLUSH
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     iocache.s[0].handle = p->vcsm_handle;
+-@@ -261,6 +263,34 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- #endif
+- }
+- 
+-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p0->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p0->arm;
+-+    iocache.s[0].size  = p0->numbytes;
+-+    iocache.s[1].handle = p1->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int) p1->arm;
+-+    iocache.s[1].size  = p1->numbytes;
+-+    iocache.s[2].handle = p2->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int) p2->arm;
+-+    iocache.s[2].size  = p2->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp;
+-+    tmp = vcsm_lock(p0->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p1->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p2->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+-+}
+-+
+- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-   p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-@@ -357,9 +387,19 @@ unsigned int vpu_get_constants(void) {
+- #ifdef RPI_ASYNC
+- 
+- static void *vpu_start(void *arg) {
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+  int last_time=0;
+-+  long long on_time=0;
+-+  long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  int count=0;
+-+#endif
+-   while(1) {
+-+    int i;
+-     int *p;
+-     int qpu_code;
+-+    int qpu_codeb;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -373,24 +413,49 @@ static void *vpu_start(void *arg) {
+-       break; // Last job
+-     }
+-     qpu_code = p[7];
+-+    qpu_codeb = p[16];
+-     //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-     //}
+-+
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+    start_time = Microseconds();
+-+    if (last_time==0)
+-+      last_time = start_time;
+-+    off_time += start_time-last_time;
+-+#endif
+-+
+-     if (!qpu_code) {
+-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-     } else {
+--      int i;
+-       for(i=0;i<8;i++) {
+-         gpu->mail[i*2] = p[8+i];
+-         gpu->mail[i*2 + 1] = qpu_code;
+-       }
+--
+--      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+--                              0, 0, 0, 0,
+-+      for(i=0;i<12;i++) {
+-+        gpu->mail2[i*2] = p[17+i];
+-+        gpu->mail2[i*2 + 1] = qpu_codeb;
+-+      }
+-+#if (0)
+-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+      execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
+-+#else
+-+      execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+#endif
+-     }
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+    end_time = Microseconds();
+-+    last_time = end_time;
+-+    on_time += end_time - start_time;
+-+    count++;
+-+    if ((count&0x7f)==0)
+-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+-@@ -436,7 +501,9 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+- }
+- 
+- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
+-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
+-+                      )
+- {
+- 
+-   pthread_mutex_lock(&post_mutex);
+-@@ -464,6 +531,21 @@ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2,
+-     p[13] = unifs6;
+-     p[14] = unifs7;
+-     p[15] = unifs8;
+-+
+-+    p[16] = qpu_codeb;
+-+    p[17] = unifs1b;
+-+    p[18] = unifs2b;
+-+    p[19] = unifs3b;
+-+    p[20] = unifs4b;
+-+    p[21] = unifs5b;
+-+    p[22] = unifs6b;
+-+    p[23] = unifs7b;
+-+    p[24] = unifs8b;
+-+    p[25] = unifs9b;
+-+    p[26] = unifs10b;
+-+    p[27] = unifs11b;
+-+    p[28] = unifs12b;
+-+
+-     if (num<=1)
+-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -544,27 +626,27 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
+-   off_time += start_time-last_time;
+- #endif
+-   for(i=0;i<num;i++) {
+--    gpu->mail[i*2 + 1] = code;
+-+    gpu->mail2[i*2 + 1] = code;
+-   }
+-   for(;i<num+num2;i++) {
+--    gpu->mail[i*2 + 1] = code2;
+-+    gpu->mail2[i*2 + 1] = code2;
+-   }
+--  gpu->mail[0 ] = unifs1;
+--  gpu->mail[2 ] = unifs2;
+--  gpu->mail[4 ] = unifs3;
+--  gpu->mail[6 ] = unifs4;
+--  gpu->mail[8 ] = unifs5;
+--  gpu->mail[10] = unifs6;
+--	gpu->mail[12] = unifs7;
+--	gpu->mail[14] = unifs8;
+--	gpu->mail[16] = unifs9;
+--	gpu->mail[18] = unifs10;
+--	gpu->mail[20] = unifs11;
+--	gpu->mail[22] = unifs12;
+-+  gpu->mail2[0 ] = unifs1;
+-+  gpu->mail2[2 ] = unifs2;
+-+  gpu->mail2[4 ] = unifs3;
+-+  gpu->mail2[6 ] = unifs4;
+-+  gpu->mail2[8 ] = unifs5;
+-+  gpu->mail2[10] = unifs6;
+-+	gpu->mail2[12] = unifs7;
+-+	gpu->mail2[14] = unifs8;
+-+	gpu->mail2[16] = unifs9;
+-+	gpu->mail2[18] = unifs10;
+-+	gpu->mail2[20] = unifs11;
+-+	gpu->mail2[22] = unifs12;
+- 	execute_qpu(
+- 		gpu->mb,
+- 		12 /* Number of QPUs */,
+--		gpu->vc + offsetof(struct GPU, mail),
+-+		gpu->vc + offsetof(struct GPU, mail2),
+- 		1 /* no flush */,  // Don't flush VPU L1 cache
+- 		5000 /* timeout ms */);
+- #ifdef RPI_TIME_TOTAL_QPU
+-@@ -635,21 +717,21 @@ unsigned int qpu_get_fn(int num) {
+-       gpu_unlock();
+-     }
+-     switch(num) {
+--    //case QPU_MC_SETUP:
+--    //  fn = mc_setup;
+--    //  break;
+--    //case QPU_MC_FILTER:
+--    //  fn = mc_filter;
+--    //  break;
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-     case QPU_MC_EXIT:
+-       fn = mc_exit;
+-       break;
+--    //case QPU_MC_INTERRUPT_EXIT:
+--    //  fn = mc_interrupt_exit;
+--    //  break;
+--    //case QPU_MC_FILTER_B:
+--    //  fn = mc_filter_b;
+--    //  break;
+-+    case QPU_MC_INTERRUPT_EXIT12:
+-+      fn = mc_interrupt_exit12;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-     //case QPU_MC_FILTER_HONLY:
+-     //  fn = mc_filter_honly;
+-     //  break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 0565a60..81c2bb1 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -1,6 +1,7 @@
+- #ifndef RPI_QPU_H
+- #define RPI_QPU_H
+- 
+-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+- #define RPI_FAST_CACHEFLUSH
+- 
+- typedef struct gpu_mem_ptr_s {
+-@@ -16,6 +17,7 @@ extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+- extern void gpu_free(GPU_MEM_PTR_T *p);
+- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+-@@ -26,7 +28,7 @@ enum {
+-   QPU_MC_SETUP,
+-   QPU_MC_FILTER,
+-   QPU_MC_EXIT,
+--  QPU_MC_INTERRUPT_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT12,
+-   QPU_MC_FILTER_B,
+-   QPU_MC_FILTER_HONLY,
+-   QPU_MC_SETUP_UV,
+-@@ -44,7 +46,9 @@ extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
+-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
+-+                      );
+- extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index a0f0282..e86eb30 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -48,693 +48,674 @@ unsigned int rpi_shader[] = {
+- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
+--/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+--/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+--/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+--/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+--/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000000d0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000000d8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000000e0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000000e8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000000f0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000000f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000100] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000108] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x00000110] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000118] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+-+/* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
+-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
+-+/* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
+-+/* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
+-+/* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000158] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000160] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000168] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000170] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000178] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000180] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000188] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000190] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000198] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000001a0] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+-+/* [0x000001a8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001b0] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+-+/* [0x000001b8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x000001c0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x000001c8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x000001d0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+- /* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+--/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+--/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x000001e0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x000001e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000001f0] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f8] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000200] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000208] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000210] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000230] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000238] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000240] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000248] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000250] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000260] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000268] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000270] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000278] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000280] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000288] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000290] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000298] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002a0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002a8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002b0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002b8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002d0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002d8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002e0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000002e8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000002f0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000002f8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000300] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000308] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000310] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000318] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000320] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000328] */ 0x0f9e7080, 0x100208e7, // asr r3, r0, r2
+-+/* [0x00000330] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000338] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000348] */ 0x0f9e7080, 0x100608e7, // asr.ifnz r3, r0, r2
+-+/* [0x00000350] */ 0x119c87c0, 0xd00213a7, // shl rb14,r3,8
+-+/* [0x00000358] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000360] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000368] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000370] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000378] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000380] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003a8] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000003e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000003e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000003f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000003f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000400] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000408] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000410] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000418] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000420] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000428] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000430] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000438] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000440] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000448] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000450] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000458] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000460] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00000468] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000470] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000478] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000480] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00000488] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000490] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000498] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004b0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004c0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004c8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004e0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004e8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000004f0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000004f8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000500] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000508] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000510] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000518] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000520] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000528] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000530] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000538] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000540] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000548] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000550] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000558] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000560] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000568] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000570] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000578] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000580] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000588] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000590] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000598] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d8] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000005e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000005f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000610] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000650] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000660] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000668] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000670] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000680] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000690] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006a0] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000006a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000006b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006c8] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006d0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006d8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006e0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000006f8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000700] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000708] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000710] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000720] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000728] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000740] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000748] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000750] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000758] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000760] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000768] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000770] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000778] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000780] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000788] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000790] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000798] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007e0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007e8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007f0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000007f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000800] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000808] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000810] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000818] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000820] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000828] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000840] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000848] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000850] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000860] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000890] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008d0] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a18] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a68] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup
+--/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+--/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+--/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+--/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+--/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+--/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000ac0] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000ac8] */ 0x15827d80, 0x10020227, // mov ra8, unif
+-+/* [0x00000ad0] */ 0x15827d80, 0x10020267, // mov ra9, unif
+-+/* [0x00000ad8] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+-+/* [0x00000ae0] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+-+/* [0x00000ae8] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000af0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000af8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000b00] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000b08] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+-+/* [0x00000b10] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+-+/* [0x00000b18] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+-+/* [0x00000b20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b28] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000b30] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000b38] */ 0x15227d80, 0x10020867, // mov r1, ra8
+-+/* [0x00000b40] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000b48] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000b50] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000b58] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000b60] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000b68] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+-+/* [0x00000b70] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b78] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b88] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b90] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b98] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000ba0] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000ba8] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000bb0] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+-+/* [0x00000bb8] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000bc0] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000bc8] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000bd0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000bd8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000be0] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+-+/* [0x00000be8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000bf0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000bf8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000c00] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000c08] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000c10] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000c18] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000c20] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000c28] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c30] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c38] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c40] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c48] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c50] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c58] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c78] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c80] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c88] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c90] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c98] */ 0x00004000, 0xe00204a7, // mov ra18, 0x4000
+-+/* [0x00000ca0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000ca8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000cb0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000cb8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000cc0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cc8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cd8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000ce0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000ce8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cf0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cf8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000d00] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000d08] */ 0x919c82ff, 0xd0024822, // shl r0,r1,r3 ; mov r2,8
+-+/* [0x00000d10] */ 0x0f9e70c0, 0x10021367, // asr rb13,r0,r3
+-+/* [0x00000d18] */ 0x0f9e72c0, 0x10021327, // asr rb12,r1,r3
+-+/* [0x00000d20] */ 0x0c9cde80, 0x10021367, // add rb13,rb13,r2
+-+/* [0x00000d28] */ 0x119cce80, 0x10021327, // shl rb12, rb12, r2
+-+/* [0x00000d30] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d38] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d40] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d48] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d50] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d58] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d60] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+- /* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+- /* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+--// ::mc_filter
+-+/* [0x00000d78] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+-+// :per_block_setup
+- /* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- /* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+- /* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+- /* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000da0] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000da8] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000db0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000db8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000dc0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000dc8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000dd0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000dd8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000de0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000de8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000df0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000df8] */ 0x8c827436, 0x100246a1, // add ra_frame_base_next, r2, r0 ; mov r1, unif
+-+/* [0x00000e00] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000e08] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000e10] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000e18] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000e20] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000e28] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000e30] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000e38] */ 0x159e7240, 0x10021067, // mov ra_y2_next, r1
+-+/* [0x00000e40] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e48] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e58] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e60] */ 0x0e9e70c0, 0x10020867, // shr r1, r0, r3
+-+/* [0x00000e68] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e70] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e78] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e80] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e88] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e90] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e98] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
+-+/* [0x00000ea0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ea8] */ 0x95801dbf, 0xd0024821, // mov r0, unif ; mov r1,1
+-+/* [0x00000eb0] */ 0x4f5971c6, 0x10024260, // asr ra9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024220, // asr ra8, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x4f5971c6, 0x10044260, // asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22
+-+/* [0x00000ec8] */ 0x0f9d71c0, 0x10040227, // asr.ifz ra8, r0, rb23
+-+/* [0x00000ed0] */ 0x0d243f80, 0xd0020267, // sub ra9,3,ra9
+-+/* [0x00000ed8] */ 0x0d203f80, 0xd0020227, // sub ra8,3,ra8
+-+/* [0x00000ee0] */ 0x11243dc0, 0xd0020267, // shl ra9,ra9,3
+-+/* [0x00000ee8] */ 0x11203dc0, 0xd0020227, // shl ra8,ra8,3
+-+/* [0x00000ef0] */ 0x00ffff00, 0xe0020867, // mov r1,0xffff00
+-+/* [0x00000ef8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f00] */ 0x0f9d71c0, 0x10020027, // asr ra0, r0, rb23
+-+/* [0x00000f08] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+- /* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000f18] */ 0x01040400, 0xe0020867, // mov r1,0x1040400
+-+/* [0x00000f20] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f28] */ 0x0f9d71c0, 0x10020067, // asr ra1, r0, rb23
+-+/* [0x00000f30] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f38] */ 0x0f9d71c0, 0x10021167, // asr rb5, r0, rb23
+-+/* [0x00000f40] */ 0xfbf5f600, 0xe0020867, // mov r1,0xfbf5f600
+-+/* [0x00000f48] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f50] */ 0x0f9d71c0, 0x100200a7, // asr ra2, r0, rb23
+-+/* [0x00000f58] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f60] */ 0x0f9d71c0, 0x100211a7, // asr rb6, r0, rb23
+-+/* [0x00000f68] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+-+/* [0x00000f70] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f78] */ 0x0f9d71c0, 0x100200e7, // asr ra3, r0, rb23
+-+/* [0x00000f80] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f88] */ 0x0f9d71c0, 0x100211e7, // asr rb7, r0, rb23
+-+/* [0x00000f90] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+-+/* [0x00000f98] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000fa0] */ 0x0f9d71c0, 0x10020127, // asr ra4, r0, rb23
+-+/* [0x00000fa8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000fb0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000fb8] */ 0xf6f5fb00, 0xe0020867, // mov r1,0xf6f5fb00
+-+/* [0x00000fc0] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000fc8] */ 0x0f9d71c0, 0x10020167, // asr ra5, r0, rb23
+-+/* [0x00000fd0] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000fd8] */ 0x0f9d71c0, 0x10021267, // asr rb9, r0, rb23
+-+/* [0x00000fe0] */ 0x04040100, 0xe0020867, // mov r1,0x4040100
+-+/* [0x00000fe8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000ff0] */ 0x0f9d71c0, 0x100201a7, // asr ra6, r0, rb23
+-+/* [0x00000ff8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00001000] */ 0x0f9d71c0, 0x100212a7, // asr rb10, r0, rb23
+-+/* [0x00001008] */ 0xffff0000, 0xe0020867, // mov r1,0xffff0000
+-+/* [0x00001010] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00001018] */ 0x0f9d71c0, 0x100201e7, // asr ra7, r0, rb23
+-+/* [0x00001020] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00001028] */ 0x0f9d71c0, 0x100212e7, // asr rb11, r0, rb23
+-+/* [0x00001030] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001038] */ 0x0f9e70c0, 0x100213e7, // asr rb15, r0, r3
+-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001048] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
+-+/* [0x00001050] */ 0x8f9c00ff, 0xd0024823, // asr r0, r0, r3 ; mov r3, 0
+-+/* [0x00001058] */ 0x119c81c0, 0xd00213a7, // shl rb14, r0, 8
+-+// ::mc_filter
+- // :yloop
+--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001060] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001068] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00001070] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001078] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001080] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001088] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001090] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001098] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000010a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000010a8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000010b0] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000010b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000010c0] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000010c8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+-+/* [0x000010d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000010d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000010e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000010e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000010f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000010f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001100] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001108] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001110] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001118] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001120] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001128] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001130] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001138] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001140] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001148] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001150] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001158] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001160] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
+-+/* [0x00001168] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001170] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001178] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001180] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001188] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001190] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001198] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000011a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000011a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000011b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000011b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000011c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000011c8] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000011d0] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000011d8] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000011e0] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000011e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000011f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000011f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00001200] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00001208] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00001210] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001218] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001220] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001228] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001230] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001238] */ 0xfffffb28, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001240] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001248] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001250] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
+--/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+--/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+--/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+--/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+--/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+--/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+--/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+--/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloopb
+--/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+--/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+--/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001258] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001260] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00001268] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001270] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001278] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001280] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001288] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001290] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001298] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000012a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000012a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000012b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000012b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000012c0] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+-+/* [0x000012c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000012d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000012d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000012e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000012e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000012f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000012f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001300] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001308] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001310] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001318] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001320] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001328] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001330] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001338] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001340] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001348] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001350] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001358] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
+-+/* [0x00001360] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001368] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001370] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001378] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001380] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001388] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001398] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000013a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000013a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000013b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000013b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000013c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000013c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000013d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000013d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000013e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000013e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000013f0] */ 0x0f9ce3c0, 0xd0020827, // asr r0, r1, 14
+-+/* [0x000013f8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00001400] */ 0x405b8006, 0xd00049e0, // nop                     ; mul24 r0, r0 << 8, ra22 << 8
+-+/* [0x00001408] */ 0x0c4a7380, 0x10020867, // add r1, r1, ra18
+-+/* [0x00001410] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001418] */ 0xfffffe20, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001420] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00001428] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001430] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001438] */ 0xfffff928, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001440] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001448] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001450] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_interrupt_exit12
+--/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001458] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001460] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001468] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001470] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001478] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001480] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001488] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001490] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001498] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000014e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000014e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x000014f0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000014f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001500] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001508] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001510] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001518] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001520] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001528] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 6e552d9..760bd17 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,15 +4,16 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 144)
+--#define mc_filter_uv_b0 (rpi_shader + 334)
+--#define mc_filter_uv_b (rpi_shader + 486)
+--#define mc_exit (rpi_shader + 662)
+--#define mc_interrupt_exit8 (rpi_shader + 680)
+--#define mc_setup (rpi_shader + 710)
+--#define mc_filter (rpi_shader + 864)
+--#define mc_filter_b (rpi_shader + 1104)
+--#define mc_interrupt_exit12 (rpi_shader + 1360)
+--#define mc_end (rpi_shader + 1398)
+-+#define mc_filter_uv (rpi_shader + 130)
+-+#define mc_filter_uv_b0 (rpi_shader + 312)
+-+#define mc_filter_uv_b (rpi_shader + 464)
+-+#define mc_exit (rpi_shader + 640)
+-+#define mc_interrupt_exit8 (rpi_shader + 658)
+-+#define mc_setup (rpi_shader + 688)
+-+#define mc_filter (rpi_shader + 1048)
+-+#define mc_filter_b (rpi_shader + 1174)
+-+#define mc_interrupt_exit12 (rpi_shader + 1302)
+-+#define mc_exit1 (rpi_shader + 1340)
+-+#define mc_end (rpi_shader + 1356)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index a0b8e5a..60d1ec2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -21,6 +21,7 @@
+- #
+- # ra16                                          clipped(row start address+elem_num)&~3
+- # ra17                                          per-channel shifts
+-+# ra18                                          0x4000
+- # ra19                                          next ra17
+- #
+- # rb16                                          pitch
+-@@ -86,7 +87,7 @@
+- 
+- 
+- ################################################################################
+--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+- ::mc_setup_uv
+- 
+- # Read starting kernel
+-@@ -132,36 +133,6 @@ mov ra13, 0
+- mov ra14, 0
+- mov ra15, 0
+- 
+--# Compute part of VPM to use for DMA output
+--mov r3, unif
+--shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+--
+--# Compute part of VPM to save data into
+--shl r2, r3, 1
+--and r2, r2, 15    # r2 = bcd0
+--mov r1, r2        # r1 = bcd0
+--asr r1, r1, 2     # r1 = bc
+--shl r1, r1, 6     # r1 = bc000000
+--mov r0, r2        # r0 = bcd0
+--and r0, r0, 3     # r0 = d0
+--add r0, r0, r1    # r0 = bc0000d0
+--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+--add rb28, r0, r1
+--asr r0, r0, 1     # r0 = bc0000d
+--# Prepare VPM command for 16bit intermediates
+--mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+--add rb21, r0, r1
+--
+- # Compute base address for first and second access
+- mov r0, ra_x           # Load x
+- max r0, r0, 0; mov r1, ra_y # Load y
+-@@ -175,10 +146,31 @@ min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_frame_base, r2
+--add t0s, r2, r1
+-+add t1s, r2, r1
+-+
+-+mov r2,8
+-+shl rb12,unif, r2 # offset before shift
+-+add rb13,unif,r2  # offset after shift
+-+
+-+# Compute part of VPM to use for DMA output
+-+mov r2, unif
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+- 
+--mov rb12,unif # offset before shift
+--mov rb13,unif # offset after shift
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1  # VPM 8bit storage
+-+asr r2, r0, 1     # r0 = bc0000d
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r2, r1  # VPM for 16bit intermediates
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1  # DMA out
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -187,7 +179,7 @@ add ra_y, ra_y, 1
+- bra -, ra31
+- nop ; mul24 r1, r1, rb_pitch
+- add t0s, r1, ra_x
+--add t0s, r1, ra_frame_base
+-+add t1s, r1, ra_frame_base
+- 
+- 
+- 
+-@@ -248,17 +240,15 @@ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- mov r0, unif # U offset/weight
+- asr rb15, r0, r2  # Compute offset from MSBs
+- shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+-+asr r3, r0, r2  # Compute weight from LSBs
+- mov r0, unif # V offset/weight
+- asr.ifnz rb15, r0, r2
+- shl r0, r0, r2
+--asr.ifnz rb14, r0, r2
+-+asr.ifnz r3, r0, r2
+-+shl rb14,r3,8 # Scale up weights so we can use mul24 in signed fashion
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+--
+--mov r5rep, -8
+--
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+- 
+-@@ -269,7 +259,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -278,7 +268,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -301,11 +291,6 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+--mov rb12,32 # TODO remove these to make P weighted prediction work properly
+--mov rb13,6
+--mov rb14,1
+--mov rb15,0
+--
+- # apply vertical filter and write to VPM
+- 
+- nop                     ; mul24 r1, ra14, rb10
+-@@ -412,7 +397,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -421,7 +406,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -542,7 +527,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -551,7 +536,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -617,9 +602,9 @@ mov  -, vw_wait # wait on the VDW
+- mov -,srel(0)
+- 
+- ldtmu0
+-+ldtmu1
+- ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu1
+- 
+- nop        ; nop ; thrend
+- nop        ; nop # delay slot 1
+-@@ -630,9 +615,9 @@ nop        ; nop # delay slot 2
+- mov  -, vw_wait # wait on the VDW
+- 
+- ldtmu0
+-+ldtmu1
+- ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu1
+- 
+- mov -,sacq(0) # 1
+- mov -,sacq(0) # 2
+-@@ -656,200 +641,249 @@ nop        ; nop # delay slot 2
+- # For P frames we make the second x,y coordinates offset by +8
+- 
+- ################################################################################
+--# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
+-+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+- ::mc_setup
+-+  mov r3, 16
+- 
+--# Read starting kernel
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+--
+--# Compute base address for first and second access
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--add ra_y, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+--max r1, r1, 0
+--min r1, r1, rb_frame_height_minus_1
+--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+--add t0s, r2, r1 ; mov ra_frame_base, r2
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+--max r1, r1, 0
+--min r1, r1, rb_frame_height_minus_1
+--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+--add t0s, r2, r1 ; mov ra_frame_base2, r2
+--
+-+  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+-+  mov ra8, unif
+-+  mov ra9, unif
+-+  mov ra10, unif
+-+  mov ra11, unif
+- 
+- # Read image dimensions
+--sub rb25,unif,1
+--sub rb30,unif,1
+-+  mov r1, unif # width_height
+-+  shl r0,r1,r3
+-+  asr r1,r1,r3 # width
+-+  asr r0,r0,r3 # height
+-+  sub rb_frame_width_minus_1,r1,1
+-+  sub rb_frame_height_minus_1,r0,1
+- 
+- # get source pitch
+--mov rb16, unif
+-+  mov rb_pitch, unif
+- 
+- # get destination pitch
+--mov r0, unif
+--mov r1, vdw_setup_1(0)
+--add rb24, r1, r0
+-+  mov r0, unif
+-+  mov r1, vdw_setup_1(0)
+-+  add rb24, r1, r0
+- 
+--# load constants
+--
+--mov ra20, 1
+--mov ra22, 256
+--mov ra30, 64
+--
+--mov rb20, 0xffffff00
+--mov rb22, 255
+--mov rb23, 24
+-+# Compute base address for first and second access
+-+  mov r1, ra8 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  add ra_y, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+-+  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+  mov r1, ra10 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  add ra_y2, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+-+  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
+- 
+--# touch vertical context to keep simulator happy
+- 
+--mov ra8, 0
+--mov ra9, 0
+--mov ra10, 0
+--mov ra11, 0
+--mov ra12, 0
+--mov ra13, 0
+--mov ra14, 0
+--mov ra15, 0
+-+# load constants
+- 
+--# Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+-+  mov ra20, 1
+-+  mov ra22, 256
+-+  mov ra30, 64
+- 
+--# Compute part of VPM to save data into
+--mov r2, qpu_num   # qpu_num = abcd
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+--add rb28, r0, r1
+-+  mov rb20, 0xffffff00
+-+  mov rb22, 255
+-+  mov rb23, 24
+- 
+--mov rb12,unif # offset before shift
+--mov rb13,unif # shift
+-+# touch vertical context to keep simulator happy
+- 
+--# Dump padding words
+--mov r0, unif
+-+  mov ra8, 0
+-+  mov ra9, 0
+-+  mov ra10, 0
+-+  mov ra11, 0
+-+  mov ra12, 0
+-+  mov ra13, 0
+-+  mov ra14, 0
+-+  mov ra15, 0
+-+  mov ra18, 0x4000
+-+
+-+# Compute part of VPM to use
+-+  mov r2, qpu_num
+-+  mov r1, r2
+-+  asr r1, r1, 2
+-+  shl r1, r1, 6
+-+  mov r0, r2
+-+  and r0, r0, 3
+-+  add r0, r0, r1
+-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+  add rb28, r0, r1  # VPM for saving data
+-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+  shl r0, r0, 5
+-+  add rb27, r0, r1  # Command for dma output
+-+
+-+# Weighted prediction denom
+-+
+-+  mov r1, unif # offset_shift
+-+  shl r0,r1,r3 ; mov r2,8
+-+  asr rb13,r0,r3 # shift
+-+  asr rb12,r1,r3 # offset
+-+  add rb13,rb13,r2    # mul24 is unsigned so scale up into high bits
+-+  shl rb12, rb12, r2 # Account for larger shift
+- 
+- # submit texture requests for second line
+--max r1, ra_y, 0
+--min r1, r1, rb_frame_height_minus_1
+--add ra_y, ra_y, 1
+--nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_frame_base
+--
+--max r1, ra_y2, 0
+--min r1, r1, rb_frame_height_minus_1
+--bra -, ra31
+--add ra_y2, ra_y2, 1           # Delay 1
+--nop ; mul24 r1, r1, rb_pitch  # Delay 2
+--add t0s, r1, ra_frame_base2   # Delay 3
+--
+--
+--################################################################################
+--
+--# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+--# In a P block, only the first half of coefficients contain used information.
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x, ra_x16_base point to the current coordinates for this block
+--::mc_filter
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+-+  max r1, ra_y, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t0s, r1, ra_frame_base
+-+
+-+  max r1, ra_y2, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t1s, r1, ra_frame_base2
+-+
+-+# FALL THROUGHT TO PER-BLOCK SETUP
+-+
+-+# Start of per-block setup code
+-+# P and B blocks share the same setup code to save on Icache space
+-+:per_block_setup
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov ra31, unif
+- 
+- # per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov rx_xshift2, rx_xshift2_next
+-+  mov ra_xshift, ra_xshift_next
+-+  mov rx_xshift2, rx_xshift2_next
+- 
+- # get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--mov ra_y_next, r1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0   ; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2_next, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+--
+-+  mov r3, 16
+-+  mov r1, unif # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  mov ra_y_next, r1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add ra_frame_base_next, r2, r0 ; mov r1, unif # y2_x2
+-+
+-+  shl r0,r1,r3 # r0 is x2<<16
+-+  asr r1,r1,r3 # r1 is y2
+-+  asr r0,r0,r3 # r0 is x2
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  mov ra_y2_next, r1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+- 
+- # set up VPM write
+--mov vw_setup, rb28
+-+  mov vw_setup, rb28
+- 
+- # get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+-+  mov r0, unif
+-+  shr r1, r0, r3 # Extract width
+-+  sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+  and r0, r0, rb22 # Extract height
+-+  add rb17, r0, 5
+-+  add rb18, r0, 7
+-+  shl r0, r0, 7
+-+  add r0, r0, r1 # Combine width and height of destination area
+-+  shl r0, r0, r3 # Shift into bits 16 upwards of the vdw_setup0 register
+-+  add rb26, r0, rb27
+- 
+- # get filter coefficients and discard unused B frame values
+--mov r0, unif
+--mov.ifnz -, unif # Alternate coefficients are unused for P frames
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--asr rb4, r0, rb23
+--
+--mov r0, unif # Frame0 offset/weight
+--mov.ifnz -, unif # Frame1 offset/weight unused
+--asr rb15, r0, r2  # Compute offset from MSBs
+--shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+--
+--# r3 is loop counter
+-+  mov r0, unif ; mov r1,1  # Packed filter offsets, unpack into ra8... (to be used for vertical context later)
+-+  asr ra9, r0, rb23;      mul24 r0, r0, ra22 # my2
+-+  asr ra8, r0, rb23;      mul24 r0, r0, ra22 # mx2
+-+  asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22 # my:my2
+-+  asr.ifz ra8, r0, rb23                      # mx:mx2
+-+  sub ra9,3,ra9
+-+  sub ra8,3,ra8
+-+  shl ra9,ra9,3   # Scale up by 8
+-+  shl ra8,ra8,3   # Scale up by 8
+-+# Now if we want aligned we have a mul of 1, so put 0 coefficients at the top
+-+  mov r1,0xffff00
+-+  shl r0, r1, ra8
+-+  asr ra0, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb4, r0, rb23
+-+
+-+  mov r1,0x1040400
+-+  shl r0, r1, ra8
+-+  asr ra1, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb5, r0, rb23
+-+
+-+  mov r1,0xfbf5f600
+-+  shl r0, r1, ra8
+-+  asr ra2, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb6, r0, rb23
+-+
+-+  mov r1,0x11283a40
+-+  shl r0, r1, ra8
+-+  asr ra3, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb7, r0, rb23
+-+
+-+  mov r1,0x3a281100
+-+  shl r0, r1, ra8
+-+  asr ra4, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb8, r0, rb23
+-+
+-+  mov r1,0xf6f5fb00
+-+  shl r0, r1, ra8
+-+  asr ra5, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb9, r0, rb23
+-+
+-+  mov r1,0x4040100
+-+  shl r0, r1, ra8
+-+  asr ra6, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb10, r0, rb23
+-+
+-+  mov r1,0xffff0000
+-+  shl r0, r1, ra8
+-+  asr ra7, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb11, r0, rb23
+-+
+-+# Extract weighted prediction information
+-+  mov r0, unif      # offset/weight  TODO move up
+-+  asr rb15, r0, r3  # Compute offset from MSBs
+-+  bra -, ra31
+-+  shl r0, r0, r3    #                                                            Delay 1
+-+  asr r0, r0, r3 ; mov r3, 0 # Compute weight from LSBs and reset loop counter   Delay 2
+-+  shl rb14, r0, 8 # Use a larger shift to avoid unsigned multiply problem        Delay 3
+- 
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+-+################################################################################
+-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+# In a P block, y2_x2 should be y_x+8
+-+# At this point we have already issued two pairs of texture requests for the current block
+- 
+--mov r3, 0
+-+::mc_filter
+- 
+- :yloop
+- # retrieve texture results and pick out bytes
+-@@ -858,91 +892,90 @@ mov r3, 0
+- # If we knew there was no clipping then this code would get simpler.
+- # Perhaps we could add on the pitch and clip using larger values?
+- 
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, rx_xshift2
+--mov.ifz ra_y2, ra_y2_next
+-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2
+-+  mov.ifz ra_y2, ra_y2_next
+- 
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+--
+--max r2, ra_y2, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+- 
+-+  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+- 
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # apply horizontal filter
+--nop                  ; mul24 r2, r0, ra0
+--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--add r0, r2, r3       ; mov r3, rb31
+--sub.setf -, r3, 8    ; mov ra12, ra13
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--brr.anyn -, r:yloop
+--mov ra13, ra14       # Delay slot 1
+--mov ra14, ra15       # Delay slot 2
+--mov ra15, r0         # Delay slot 3
+-+  nop                  ; mul24 r2, r0, ra0
+-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+  add r0, r2, r3       ; mov r3, rb31
+-+  sub.setf -, r3, 8    ; mov ra8, ra9
+-+  mov ra9, ra10
+-+  mov ra10, ra11
+-+  mov ra11, ra12
+-+  mov ra12, ra13
+-+  brr.anyn -, r:yloop
+-+  mov ra13, ra14       # Delay slot 1
+-+  mov ra14, ra15       # Delay slot 2
+-+  mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb10
+--nop                     ; mul24 r0, ra13, rb9
+--add r1, r1, r0          ; mul24 r0, ra12, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb11
+--add r1, r1, r0          ; mul24 r0, ra8, rb4
+--add r1, r1, r0          ; mul24 r0, ra9, rb5
+--add r1, r1, r0          ; mul24 r0, ra10, rb6
+--add r1, r1, r0          ; mul24 r0, ra11, rb7
+--
+--add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--nop                     ; mul24 r1, r1, rb14
+--add r1, r1, rb12
+--asr r1, r1, rb13
+--brr.anyn -, r:yloop
+--add r1, r1, rb15       # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+  nop                     ; mul24 r1, ra14, rb10
+-+  nop                     ; mul24 r0, ra13, rb9
+-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+  asr r1, r1, 14
+-+  nop                     ; mul24 r1, r1, rb14
+-+  add r1, r1, rb12
+-+  asr r1, r1, rb13
+-+  brr.anyn -, r:yloop
+-+  add r1, r1, rb15       # Delay 1
+-+  min r1, r1, rb22       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+- 
+- # DMA out
+- 
+--bra -, ra31
+--mov vw_setup, rb26 # VDW setup 0    Delay 1
+--mov vw_setup, rb29 # Stride         Delay 2
+--mov vw_addr, unif # start the VDW   Delay 3
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+- 
+- 
+- 
+- ################################################################################
+- 
+--# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+- # In a P block, only the first half of coefficients contain used information.
+- # At this point we have already issued two pairs of texture requests for the current block
+- # May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+-@@ -952,92 +985,6 @@ mov vw_addr, unif # start the VDW   Delay 3
+- # Or possibly by taking advantage of symmetry?
+- # From 19->7 32bits per command.
+- ::mc_filter_b
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov rx_xshift2, rx_xshift2_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--mov ra_y_next, r1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0   ; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2_next, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+--
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# get filter coefficients and discard unused B frame values
+--mov r0, unif
+--mov r1, 1
+--mov.ifnz r0, unif # Alternate coefficients are unused for P frames
+--nop              ;      mul24 r0, r0 << 13, r1 << 13
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 14, r1 << 14
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--nop              ;      mul24 r0, r0 << 9, r1 << 9
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 10, r1 << 10
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 11, r1 << 11
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 12, r1 << 12
+--asr ra4, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--asr rb4, r0, rb23
+--
+--mov r0, unif # Frame0 offset/weight
+--mov.ifnz r0, unif # Frame1 offset/weight unused
+--asr rb15, r0, r2  # Compute offset from MSBs
+--shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+--
+--# r3 is loop counter
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--mov r3, 0
+--
+- :yloopb
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+-@@ -1045,111 +992,123 @@ mov r3, 0
+- # If we knew there was no clipping then this code would get simpler.
+- # Perhaps we could add on the pitch and clip using larger values?
+- 
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, rx_xshift2
+--mov.ifz ra_y2, ra_y2_next
+-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2
+-+  mov.ifz ra_y2, ra_y2_next
+- 
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+--
+--max r2, ra_y2, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+- 
+-+  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+- 
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # apply horizontal filter
+--nop                  ; mul24 r2, r0, ra0
+--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--add r0, r2, r3       ; mov r3, rb31
+--sub.setf -, r3, 8    ; mov ra12, ra13
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--brr.anyn -, r:yloopb
+--mov ra13, ra14       # Delay slot 1
+--mov ra14, ra15       # Delay slot 2
+--mov ra15, r0         # Delay slot 3
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r1, ra14, rb10
+--nop                     ; mul24 r0, ra13, rb9
+--add r1, r1, r0          ; mul24 r0, ra12, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb11
+--add r1, r1, r0          ; mul24 r0, ra8, rb4
+--add r1, r1, r0          ; mul24 r0, ra9, rb5
+--add r1, r1, r0          ; mul24 r0, ra10, rb6
+--add r1, r1, r0          ; mul24 r0, ra11, rb7
+--
+--add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
+--add r1, r1, ra30        ; mul24 r0, r1, rb14
+--add r1, r1, r0
+--brr.anyn -, r:yloopb
+--asr r1, r1, 7          # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+  nop                  ; mul24 r2, r0, ra0
+-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+  add r0, r2, r3       ; mov r3, rb31
+-+  sub.setf -, r3, 8    ; mov ra8, ra9
+-+  mov ra9, ra10
+-+  mov ra10, ra11
+-+  mov ra11, ra12
+-+  mov ra12, ra13
+-+  brr.anyn -, r:yloopb
+-+  mov ra13, ra14       # Delay slot 1
+-+  mov ra14, ra15       # Delay slot 2
+-+  mov ra15, r0         # Delay slot 3
+-+
+-+  # apply vertical filter and write to VPM
+-+
+-+  nop                     ; mul24 r1, ra14, rb10
+-+  nop                     ; mul24 r0, ra13, rb9
+-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+  asr r0, r1, 14
+-+  asr r1, r1, 6           # Wait state so we can use the rotate instruction
+-+  nop                     ; mul24 r0, r0 << 8, ra22 << 8 # Rotate to align left and right halves
+-+  add r1, r1, ra18
+-+  add r1, r1, r0
+-+  brr.anyn -, r:yloopb
+-+  asr r1, r1, 15         # Delay 1
+-+  min r1, r1, rb22       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+- 
+- # DMA out
+--bra -, ra31
+--mov vw_setup, rb26 # VDW setup 0    Delay 1
+--mov vw_setup, rb29 # Stride         Delay 2
+--mov vw_addr, unif # start the VDW   Delay 3
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+- 
+- ################################################################################
+- 
+- # mc_interrupt_exit12()
+- ::mc_interrupt_exit12
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--mov -,sacq(0) # 4
+--mov -,sacq(0) # 5
+--mov -,sacq(0) # 6
+--mov -,sacq(0) # 7
+--mov -,sacq(0) # 8
+--mov -,sacq(0) # 9
+--mov -,sacq(0) # 10
+--mov -,sacq(0) # 11
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu1
+-+
+-+  mov -,sacq(0) # 1
+-+  mov -,sacq(0) # 2
+-+  mov -,sacq(0) # 3
+-+  mov -,sacq(0) # 4
+-+  mov -,sacq(0) # 5
+-+  mov -,sacq(0) # 6
+-+  mov -,sacq(0) # 7
+-+  mov -,sacq(0) # 8
+-+  mov -,sacq(0) # 9
+-+  mov -,sacq(0) # 10
+-+  mov -,sacq(0) # 11
+-+
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+-+
+-+
+-+::mc_exit1
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu0
+-+  ldtmu1
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+- 
+- 
+- ::mc_end
+--- 
+-2.7.4
+-
+-
+-From f02ec34c772aad3caa17432c6a4860f9ed0d5dc6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 10:58:25 +0100
+-Subject: [PATCH 48/68] Added option to simulate QPUs
+-
+----
+- libavcodec/hevc.c          | 288 +++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.c       |  24 ++--
+- libavcodec/rpi_shader.qasm |   6 +-
+- 3 files changed, 295 insertions(+), 23 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2da88ec..34d92e2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -56,6 +56,8 @@
+-   // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-   // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+- 
+-+  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
+-+  //#define RPI_SIMULATE_QPUS
+- 
+- #endif
+- 
+-@@ -124,7 +126,6 @@ static void pic_arrays_free(HEVCContext *s)
+- 
+- #ifdef EARLY_MALLOC
+- #else
+--    printf("pic_arrays_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-       gpu_free(&s->coeffs_buf_default);
+-       s->coeffs_buf_arm[0] = 0;
+-@@ -174,11 +175,9 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- #ifdef RPI
+- #ifdef EARLY_MALLOC
+- #else
+--    assert(sps);
+-+    av_assert0(sps);
+-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+--    printf("pic_arrays_init\n");
+--    printf("Allocated %d\n",coefs_per_row);
+-     gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-     s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-     if (!s->coeffs_buf_arm[0])
+-@@ -2988,6 +2987,274 @@ static void rpi_inter_clear(HEVCContext *s)
+- #endif
+- }
+- 
+-+
+-+#ifdef RPI_SIMULATE_QPUS
+-+
+-+static int32_t clipx(int x,int FRAME_WIDTH)
+-+{
+-+	if (x<=0) return 0;
+-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+-+	return x;
+-+}
+-+
+-+static int32_t clipy(int y,int FRAME_HEIGHT)
+-+{
+-+	if (y<=0) return 0;
+-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+-+	return y;
+-+}
+-+
+-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+-+{
+-+   int32_t vsum = 0;
+-+   int x, y;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+-+
+-+      vsum += lumaFilter[my][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+round)>>denom)+offset;
+-+
+-+   return av_clip_uint8( vsum );
+-+}*/
+-+
+-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int chromaFilterH[4];
+-+  int chromaFilterV[4];
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+  for(i=0;i<4;i++) {
+-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+-+  }
+-+
+-+   for (y = 0; y < 4; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 4; x++)
+-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+-+
+-+      vsum += chromaFilterV[y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+-+
+-+   return vsum;
+-+}
+-+
+-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+-+
+-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+-+
+-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+-+
+-+   return vsum;
+-+}
+-+
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
+-+{
+-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-+  int pitch = frame->linesize[cIdx];
+-+  uint32_t base = get_vc_address(frame->buf[cIdx]);
+-+  if (p>=base && p<base+pitch*pic_height) {
+-+    return frame->data[cIdx] + (p-base);
+-+  }
+-+  return NULL;
+-+}
+-+
+-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+-+{
+-+  SliceHeader *sh   = &s->sh;
+-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+-+  int i;
+-+  if (arm) return arm;
+-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+-+  {
+-+    for(i=0;i<sh->nb_refs[L0];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  if (sh->slice_type == B_SLICE) {
+-+    for(i=0;i<sh->nb_refs[L1];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+-+  exit(-1);
+-+  return NULL;
+-+}
+-+
+-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint8_t *ref_u_base;
+-+  uint8_t *ref_v_base;
+-+  uint32_t frame_width = p[5];
+-+  uint32_t frame_height = p[6];
+-+  uint32_t pitch = p[7];
+-+  uint32_t dst_pitch = p[8];
+-+  int32_t offset_before = p[9];
+-+  int32_t denom = p[10];
+-+  uint32_t vpm_id = p[11];
+-+  uint32_t tmp_u_dst[256];
+-+  uint32_t tmp_v_dst[256];
+-+  while(1) {
+-+    p += 12;
+-+    next_kernel = p[0-12];
+-+    x0 = p[1-12];
+-+    y0 = p[2-12];
+-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+-+      int x,y;
+-+      uint32_t width_height = p[5];
+-+      uint32_t hcoeffs = p[6];
+-+      uint32_t vcoeffs = p[7];
+-+      uint32_t offset_weight_u = p[8];
+-+      uint32_t offset_weight_v = p[9];
+-+      uint8_t *this_u_dst;
+-+      uint8_t *this_v_dst;
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
+-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
+-+      if (next_kernel!=s->mc_filter_uv_b0)
+-+      {
+-+        this_u_dst = compute_arm_addr(s,p[10],1);
+-+        this_v_dst = compute_arm_addr(s,p[11],2);
+-+      }
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter_uv) {
+-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          } else if (next_kernel==s->mc_filter_uv_b0) {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            tmp_u_dst[x+y*16] = refa;
+-+            tmp_v_dst[x+y*16] = refb;
+-+          } else {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  int y_x,y2_x2;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint32_t x2;
+-+  uint32_t y2;
+-+  uint8_t *ref_y_base;
+-+  uint8_t *ref_y2_base;
+-+  uint32_t frame_width_height = p[4];
+-+  uint32_t frame_width = frame_width_height>>16;
+-+  uint32_t frame_height = (frame_width_height<<16)>>16;
+-+  uint32_t pitch = p[5];
+-+  uint32_t dst_pitch = p[6];
+-+  int offset_shift = p[7];
+-+  int32_t offset_before = offset_shift>>16;
+-+  int32_t denom = (offset_shift<<16)>>16;
+-+  while(1) {
+-+    p += 9;
+-+    next_kernel = p[8-9];
+-+    y_x = p[0-9];
+-+    x0 = (y_x<<16)>>16;
+-+    y0 = y_x>>16;
+-+    y2_x2 = p[2-9];
+-+    x2 = (y2_x2<<16)>>16;
+-+    y2 = y2_x2>>16;
+-+
+-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+      int x,y;
+-+      uint32_t width_height = p[4];
+-+      uint32_t my2_mx2_my_mx = p[5];
+-+      uint32_t offset_weight = p[6];
+-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
+-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter) {
+-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+          }
+-+          else {
+-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+static void rpi_simulate_inter_qpu(HEVCContext *s)
+-+{
+-+  // First run the transform as normal
+-+  int i;
+-+  rpi_execute_transform(s);
+-+  for(i=0;i<8;i++)
+-+  {
+-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+-+  }
+-+  for(i=0;i<12;i++)
+-+  {
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
+-+  }
+-+}
+-+
+-+#endif
+-+
+-+
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-@@ -3006,7 +3273,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+--        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-+        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-@@ -3016,11 +3283,16 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-     }
+-     s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- #endif
+- 
+-+#ifdef RPI_SIMULATE_QPUS
+-+    rpi_simulate_inter_qpu(s);
+-+    s->vpu_id = -1;
+-+    return;
+-+#endif
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+-@@ -3101,7 +3373,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+- #endif
+- 
+--    /*if (!s->enable_rpi) {
+-+    if (!s->enable_rpi) {
+-       if (s->ps.pps->cross_component_prediction_enabled_flag)
+-         printf("Cross component\n");
+-       if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+-@@ -3110,7 +3382,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         printf("Weighted P slice\n");
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+--    }*/
+-+    }
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index e12304b..4480f72 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -13,7 +13,7 @@
+- #include <stdlib.h>
+- #include <string.h>
+- #include <stddef.h>
+--#include <assert.h>
+-+#include "libavutil/avassert.h"
+- 
+- #include "config.h"
+- 
+-@@ -160,13 +160,13 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   // Now copy over the QPU code into GPU memory
+-   {
+-     int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+--    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-   }
+-   // And the VPU code
+-   {
+-     int num_bytes = sizeof(rpi_hevc_transform);
+--    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-   // And the transform coefficients
+-@@ -216,13 +216,13 @@ static void gpu_unlock(void) {
+- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-   p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+--  assert(p->vcsm_handle);
+-+  av_assert0(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--  assert(p->vc_handle);
+-+  av_assert0(p->vc_handle);
+-   p->arm = vcsm_lock(p->vcsm_handle);
+--  assert(p->arm);
+-+  av_assert0(p->arm);
+-   p->vc = mem_lock(mb, p->vc_handle);
+--  assert(p->vc);
+-+  av_assert0(p->vc);
+-   return 0;
+- }
+- 
+-@@ -243,7 +243,7 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+- 
+- int gpu_get_mailbox(void)
+- {
+--  assert(gpu);
+-+  av_assert0(gpu);
+-   return gpu->mb;
+- }
+- 
+-@@ -297,13 +297,13 @@ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+--  assert(p->vcsm_handle);
+-+  av_assert0(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--  assert(p->vc_handle);
+-+  av_assert0(p->vc_handle);
+-   p->arm = vcsm_lock(p->vcsm_handle);
+--  assert(p->arm);
+-+  av_assert0(p->arm);
+-   p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  assert(p->vc);
+-+  av_assert0(p->vc);
+-   return 0;
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 60d1ec2..0686249 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -149,8 +149,8 @@ add t0s, r0, r1 ; mov ra_frame_base, r2
+- add t1s, r2, r1
+- 
+- mov r2,8
+--shl rb12,unif, r2 # offset before shift
+--add rb13,unif,r2  # offset after shift
+-+shl rb12,unif,r2 # offset before shift
+-+add rb13,unif,r2  # denominator
+- 
+- # Compute part of VPM to use for DMA output
+- mov r2, unif
+-@@ -185,7 +185,7 @@ add t1s, r1, ra_frame_base
+- 
+- ################################################################################
+- 
+--# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+- # ra_x, ra_x16_base point to the current coordinates for this block
+--- 
+-2.7.4
+-
+-
+-From 8bdf6b06c612ff4971c2ce99a62d093cf92468ca Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 13:17:50 +0100
+-Subject: [PATCH 49/68] Increased motion vector memory and fixed block size
+- computation for non-multiple of 2 block sizes
+-
+----
+- libavcodec/hevc.c | 50 +++++++++++++++++++++++++++++++-------------------
+- 1 file changed, 31 insertions(+), 19 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 34d92e2..3fb1e2a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -83,11 +83,9 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- // Split image of 2048 into parts 64 wide
+- // So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+--// Each block of 64*64
+--// Smallest CTU size is 16x16, so smallest block is 8x8
+--// Corresponds to a total of 83kbytes over all 12 QPUs
+-+// For each block of 64*64 the smallest block size is 8x4
+- #define RPI_LUMA_COMMAND_WORDS 9
+--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
+-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -2042,11 +2040,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-@@ -2089,12 +2089,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       if (weight_flag) {
+-@@ -2141,11 +2143,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-@@ -2189,12 +2193,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2246,11 +2252,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+--                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   *y++ = 1; // B frame weighted prediction not supported
+-                   *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-@@ -2293,12 +2301,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       u+=2; // Weights not supported in B slices
+-@@ -2309,7 +2319,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-                       u+=2; // Weights not supported in B slices
+-@@ -3178,14 +3188,15 @@ static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+- }
+- 
+- // mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+--static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+- {
+-   uint32_t next_kernel;
+-   int y_x,y2_x2;
+--  uint32_t x0;
+--  uint32_t y0;
+--  uint32_t x2;
+--  uint32_t y2;
+-+  int x0;
+-+  int y0;
+-+  int x2;
+-+  int y2;
+-+  uint32_t *p0 = p;
+-   uint8_t *ref_y_base;
+-   uint8_t *ref_y2_base;
+-   uint32_t frame_width_height = p[4];
+-@@ -3215,13 +3226,15 @@ static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-       uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-       uint32_t width = width_height >> 16;
+-       uint32_t height = (width_height << 16) >> 16;
+-+      uint8_t *dst_base = s->frame->data[0];
+-       ref_y_base = compute_arm_addr(s,p[1-9],0);
+-       ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-       for (y=0; y<height; ++y) {
+-         for (x=0; x<width; ++x) {
+-           if (next_kernel==s->mc_filter) {
+-             int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+--            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            refa = av_clip_uint8(refa);
+-+            this_dst[x+y*dst_pitch] = refa;
+-           }
+-           else {
+-             int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-@@ -3248,7 +3261,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+-   }
+-   for(i=0;i<12;i++)
+-   {
+--    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+-   }
+- }
+- 
+-@@ -3290,7 +3303,6 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_SIMULATE_QPUS
+-     rpi_simulate_inter_qpu(s);
+--    s->vpu_id = -1;
+-     return;
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From da5ae7e96dd961ccc7bc162c8acf336d54a50092 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 14:36:54 +0100
+-Subject: [PATCH 50/68] Added support for skip deblock
+-
+----
+- libavcodec/hevc.c        |  5 +++++
+- libavcodec/hevc.h        |  2 ++
+- libavcodec/hevc_filter.c | 14 ++++----------
+- 3 files changed, 11 insertions(+), 10 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 3fb1e2a..0ac4f4c 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3397,6 +3397,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #endif
+-+    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+-+                        s->nal_unit_type == NAL_TSA_N   ||
+-+                        s->nal_unit_type == NAL_STSA_N  ||
+-+                        s->nal_unit_type == NAL_RADL_N  ||
+-+                        s->nal_unit_type == NAL_RASL_N);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 5df9dcd..5cb90b5 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -890,6 +890,8 @@ typedef struct HEVCContext {
+-     int                 width;
+-     int                 height;
+- 
+-+    int used_for_ref;
+-+
+- #ifdef RPI
+-     int enable_rpi;
+-     HEVCMvCmd *unif_mv_cmds;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 11629e4..14a0952 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -512,16 +512,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+- #ifdef DISABLE_DEBLOCK_NONREF
+--    if (    s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )
+-+    if (!s->used_for_ref)
+-       return; // Don't deblock non-reference frames
+- #endif
+- #ifdef DISABLE_DEBLOCK
+-     return;
+- #endif
+-+    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+-+        return;
+- 
+-     if (x0) {
+-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+-@@ -885,11 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+- 
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+--    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )) {
+-+    if (s->enable_rpi && s->used_for_ref) {
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+-         int curr_y = ((int *)f->progress->data)[0];
+--- 
+-2.7.4
+-
+-
+-From 6401d88c310cd3bfec7be94bf3ceb6d0c5736c7e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 15:22:52 +0100
+-Subject: [PATCH 51/68] Added support for skip_frame
+-
+----
+- libavcodec/hevc.c | 15 ++++++++++-----
+- 1 file changed, 10 insertions(+), 5 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 0ac4f4c..639e4df 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3397,11 +3397,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #endif
+--    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+--                        s->nal_unit_type == NAL_TSA_N   ||
+--                        s->nal_unit_type == NAL_STSA_N  ||
+--                        s->nal_unit_type == NAL_RADL_N  ||
+--                        s->nal_unit_type == NAL_RASL_N);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-@@ -3925,6 +3920,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+-         if (ret < 0)
+-             return ret;
+- 
+-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+-+                        s->nal_unit_type == NAL_TSA_N   ||
+-+                        s->nal_unit_type == NAL_STSA_N  ||
+-+                        s->nal_unit_type == NAL_RADL_N  ||
+-+                        s->nal_unit_type == NAL_RASL_N);
+-+
+-+        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+-+            s->is_decoded = 0;
+-+            break;
+-+        }
+-         if (s->max_ra == INT_MAX) {
+-             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+-                 s->max_ra = s->poc;
+--- 
+-2.7.4
+-
+-
+-From d2951e2ca73e234d1b775621e3993948a4a2c8ea Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 09:15:38 +0100
+-Subject: [PATCH 52/68] Fixed cache flushing of luma when using old method
+-
+----
+- libavcodec/hevc_filter.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 14a0952..b286bbf 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -919,7 +919,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+- #ifdef RPI_LUMA_QPU
+--        flush_buffer(s->frame->buf[1]);
+-+        flush_buffer(s->frame->buf[0]);
+- #endif
+- 
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 7ae612e69c1cabcc7d0b37b65efa8c5bdcfa7bf5 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 11:37:27 +0100
+-Subject: [PATCH 53/68] Option to parallelise coefficient decode and inter
+- prediction and deblock for each frame
+-
+----
+- libavcodec/hevc.c              | 701 +++++++++++++++++++++++++++--------------
+- libavcodec/hevc.h              |  74 +++--
+- libavcodec/hevc_cabac.c        |  12 +-
+- libavcodec/hevcpred_template.c |   5 +-
+- 4 files changed, 522 insertions(+), 270 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 639e4df..12aacc5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,8 +43,6 @@
+- 
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+--  // For some unknown reason, the code seems to crash if I do a late malloc
+--  //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- 
+-@@ -58,6 +56,21 @@
+- 
+-   // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
+-   //#define RPI_SIMULATE_QPUS
+-+  #ifdef RPI_WORKER
+-+    #include "pthread.h"
+-+  #endif
+-+
+-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
+-+  static void rpi_execute_transform(HEVCContext *s);
+-+  static void rpi_execute_inter_qpu(HEVCContext *s);
+-+  static void rpi_execute_pred_cmds(HEVCContext *s);
+-+  static void rpi_execute_inter_cmds(HEVCContext *s);
+-+  static void rpi_inter_clear(HEVCContext *s);
+-+
+-+  // Define INTER_PASS0 to do inter prediction in first pass
+-+  //#define INTER_PASS0
+-+  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
+-+  //#define LAUNCH_PASS0
+- 
+- #endif
+- 
+-@@ -105,6 +118,143 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-   return p->vc;
+- }
+-+#endif
+-+
+-+
+-+#ifdef RPI_WORKER
+-+
+-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+-+
+-+#define LOG_ENTER
+-+#define LOG_EXIT
+-+
+-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+-+static void worker_submit_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
+-+  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to say we have completed pass1
+-+static void worker_complete_middle_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
+-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to say we have completed pass2
+-+static void worker_complete_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_head++; // This is the only place that can change head so we do not need the mutex
+-+  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to wait for all jobs to have completed at the end of a frame
+-+static void worker_wait(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  while( s->worker_head !=s->worker_tail)
+-+  {
+-+    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+-+  }
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+-+// available to receive the next job.
+-+static void worker_pass0_ready(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+    // tail is number of submitted jobs
+-+    // head is number of completed jobs
+-+    // tail-head is number of outstanding jobs in the queue
+-+    // we need to ensure there is at least 1 space left for us to use
+-+    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+-+    {
+-+      // Wait until another job is completed
+-+      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+static void *worker_start(void *arg)
+-+{
+-+  HEVCContext *s = (HEVCContext *)arg;
+-+  while(1) {
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+
+-+    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
+-+    {
+-+      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+
+-+    if (s->kill_worker) {
+-+      break;
+-+    }
+-+    LOG_ENTER
+-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+#ifndef LAUNCH_PASS0
+-+    rpi_execute_inter_qpu(s);
+-+#endif
+-+#ifndef INTER_PASS0
+-+    // Perform inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+#endif
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+
+-+    worker_complete_middle_job(s);
+-+    LOG_EXIT
+-+  }
+-+  return NULL;
+-+}
+-+
+-+static void *worker_deblock_start(void *arg)
+-+{
+-+  HEVCContext *s = (HEVCContext *)arg;
+-+  while(1) {
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
+-+    {
+-+      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+
+-+    if (s->kill_worker) {
+-+      break;
+-+    }
+-+    LOG_ENTER
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+
+-+    worker_complete_job(s);
+-+    LOG_EXIT
+-+  }
+-+  return NULL;
+-+}
+- 
+- #endif
+- 
+-@@ -121,19 +271,18 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+- static void pic_arrays_free(HEVCContext *s)
+- {
+- #ifdef RPI
+--
+--#ifdef EARLY_MALLOC
+--#else
+--    if (s->coeffs_buf_arm[0]) {
+--      gpu_free(&s->coeffs_buf_default);
+--      s->coeffs_buf_arm[0] = 0;
+--    }
+--    if (s->coeffs_buf_arm[2]) {
+--      gpu_free(&s->coeffs_buf_accelerated);
+--      s->coeffs_buf_arm[2] = 0;
+-+    int job;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      if (s->coeffs_buf_arm[job][0]) {
+-+        gpu_free(&s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = 0;
+-+      }
+-+      if (s->coeffs_buf_arm[job][2]) {
+-+        gpu_free(&s->coeffs_buf_accelerated[job]);
+-+        s->coeffs_buf_arm[job][2] = 0;
+-+      }
+-     }
+- #endif
+--#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -171,24 +320,26 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+- #ifdef RPI
+--#ifdef EARLY_MALLOC
+--#else
+-     av_assert0(sps);
+-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--    if (!s->coeffs_buf_arm[0])
+--        goto fail;
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--    if (!s->coeffs_buf_arm[2])
+--        goto fail;
+--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--    printf("Done\n");
+--#endif
+-+    int job;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      printf("Allocated %d\n",coefs_per_row);
+-+      for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-+        if (!s->coeffs_buf_arm[job][0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
+-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-+        if (!s->coeffs_buf_arm[job][2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
+-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-+      }
+-+    }
+- #endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-@@ -1036,7 +1187,7 @@ static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0,
+- {
+-     if (s->enable_rpi) {
+-         HEVCLocalContext *lc = s->HEVClc;
+--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-         cmd->type = RPI_PRED_INTRA;
+-         cmd->size = log2_trafo_size;
+-         cmd->c_idx = c_idx;
+-@@ -1496,7 +1647,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_LUMA_UNI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1515,7 +1666,7 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_LUMA_BI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1537,7 +1688,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_CHROMA_UNI;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -1555,7 +1706,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+- static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -2037,7 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2057,7 +2208,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+-         {
+-@@ -2086,7 +2237,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2110,7 +2261,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2140,7 +2291,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2160,7 +2311,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+- 
+-@@ -2190,7 +2341,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2215,7 +2366,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2249,7 +2400,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int x2 = x0 + (mv2->x >> 2);
+-             int y2 = y0 + (mv2->y >> 2);
+-             int chan = x0>>6; // 64 wide blocks per QPU
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-                   int bw = nPbW-start_x;
+-@@ -2265,7 +2416,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+-         {
+-@@ -2298,7 +2449,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2327,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2832,40 +2983,54 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- static void rpi_execute_dblk_cmds(HEVCContext *s)
+- {
+-     int n;
+-+    int job = s->pass2_job;
+-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+--    int (*p)[2] = s->dblk_cmds;
+--    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
+-+    int (*p)[2] = s->dblk_cmds[job];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+-         ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+-     }
+--    s->num_dblk_cmds = 0;
+-+    s->num_dblk_cmds[job] = 0;
+- }
+- 
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-+#ifdef LAUNCH_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-     //int j;
+-     //int16_t *coeffs = s->coeffs_buf_arm[i];
+-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf_accelerated);
+--    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
+-+                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-     //vpu_wait(s->vpu_id);
+- 
+-     for(i=0;i<4;i++)
+--        s->num_coeffs[i] = 0;
+-+        s->num_coeffs[job][i] = 0;
+- }
+- 
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+--  HEVCPredCmd *cmd = s->univ_pred_cmds;
+-+  int job = s->pass2_job;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+-+#ifdef RPI_WORKER
+-+  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+#else
+-   HEVCLocalContext *lc = s->HEVClc;
+-+#endif
+- 
+--  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
+-+  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+-       if (cmd->type == RPI_PRED_INTRA) {
+-           lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-           lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-@@ -2884,21 +3049,26 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+- #endif
+-       }
+-   }
+--  s->num_pred_cmds = 0;
+-+  s->num_pred_cmds[job] = 0;
+- }
+- 
+- static void rpi_execute_inter_cmds(HEVCContext *s)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds;
+-+#ifdef INTER_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-     int n,cidx;
+-     AVFrame myref;
+-     AVFrame myref1;
+-     struct MvField mymv;
+--    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
+-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+-         printf("Overflow inter_cmds\n");
+-         exit(-1);
+-     }
+--    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
+-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+-         switch(cmd->cmd) {
+-         case RPI_CMD_LUMA_UNI:
+-             myref.data[0] = cmd->src;
+-@@ -2938,7 +3108,28 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+-             break;
+-         }
+-     }
+--    s->num_mv_cmds = 0;
+-+    s->num_mv_cmds[job] = 0;
+-+}
+-+
+-+static void rpi_do_all_passes(HEVCContext *s)
+-+{
+-+#ifdef RPI_INTER_QPU
+-+    // Kick off inter prediction on QPUs
+-+    rpi_execute_inter_qpu(s);
+-+#else
+-+    rpi_execute_transform(s);
+-+#endif
+-+    // Perform luma inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+    rpi_inter_clear(s);
+-+#endif
+- }
+- 
+- #endif
+-@@ -2946,6 +3137,7 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- #ifdef RPI_INTER_QPU
+- static void rpi_inter_clear(HEVCContext *s)
+- {
+-+    int job = s->pass0_job;
+-     int i;
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-@@ -2953,51 +3145,50 @@ static void rpi_inter_clear(HEVCContext *s)
+-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-     for(i=0;i<8;i++) {
+--        s->u_mvs[i] = s->mvs_base[i];
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = pic_width;
+--        *s->u_mvs[i]++ = pic_height;
+--        *s->u_mvs[i]++ = s->frame->linesize[1];
+--        *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        s->u_mvs[job][i] = s->mvs_base[job][i];
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = pic_width;
+-+        *s->u_mvs[job][i]++ = pic_height;
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+-         if (weight_flag) {
+--            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+--            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+            *s->u_mvs[job][i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-+            *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+-         } else {
+--            *s->u_mvs[i]++ = 1 << 5;
+--            *s->u_mvs[i]++ = 6;
+-+            *s->u_mvs[job][i]++ = 1 << 5;
+-+            *s->u_mvs[job][i]++ = 6;
+-         }
+--        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-     }
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(i=0;i<12;i++) {
+--        s->y_mvs[i] = s->y_mvs_base[i];
+--        *s->y_mvs[i]++ = 0; // y_x
+--        *s->y_mvs[i]++ = 0; // ref_y_base
+--        *s->y_mvs[i]++ = 0; // y2_x2
+--        *s->y_mvs[i]++ = 0; // ref_y2_base
+--        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+--        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
+--        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
+-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+-+        *s->y_mvs[job][i]++ = 0; // y_x
+-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
+-+        *s->y_mvs[job][i]++ = 0; // y2_x2
+-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+-         if (weight_flag) {
+-             int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
+-             int shift = s->sh.luma_log2_weight_denom + 6;
+--            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
+-         } else {
+-             int offset = 1 << 5;
+-             int shift = 6;
+--            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
+-         }
+--        *s->y_mvs[i]++ = 0; // Next kernel
+-+        *s->y_mvs[job][i]++ = 0; // Next kernel
+-     }
+- #endif
+- }
+- 
+--
+- #ifdef RPI_SIMULATE_QPUS
+- 
+- static int32_t clipx(int x,int FRAME_WIDTH)
+-@@ -3271,10 +3462,15 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-+#ifdef LAUNCH_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-     int i;
+--    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+- #ifdef RPI_LUMA_QPU
+--    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+- #endif
+-     if (s->sh.slice_type == I_SLICE) {
+- #ifdef RPI_MULTI_MAILBOX
+-@@ -3283,22 +3479,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #endif
+-     }
+-     for(k=0;k<8;k++) {
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+--        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+--    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(k=0;k<12;k++) {
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+-     }
+--    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+    s->y_mvs[job][12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- #endif
+- 
+- #ifdef RPI_SIMULATE_QPUS
+-@@ -3308,34 +3504,34 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
+-+    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+- #else
+--    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+- #endif
+--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+- #ifdef RPI_LUMA_QPU
+-                                    qpu_get_fn(QPU_MC_SETUP),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+- #else
+-                                    0,
+-                                    0,0,0,0,
+-@@ -3344,17 +3540,17 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #endif
+-                                  );
+-     for(i=0;i<4;i++)
+--        s->num_coeffs[i] = 0;
+-+        s->num_coeffs[job][i] = 0;
+- #else
+-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+--      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+-       );
+- #endif
+- 
+-@@ -3411,6 +3607,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         }
+-     }
+- 
+-+#ifdef RPI_WORKER
+-+    s->pass0_job = 0;
+-+    s->pass1_job = 0;
+-+    s->pass2_job = 0;
+-+#endif
+- #ifdef RPI_INTER_QPU
+-     rpi_inter_clear(s);
+- #endif
+-@@ -3431,46 +3632,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+-+
+- #ifdef RPI
+-         if (s->enable_rpi) {
+--          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+--          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--            // Transform all blocks
+--            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--#ifdef RPI_MULTI_MAILBOX
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--            // Perform luma inter prediction
+--            rpi_execute_inter_cmds(s);
+--#else
+--            rpi_execute_transform(s);
+--            // Perform inter prediction
+--            rpi_execute_inter_cmds(s);
+--#ifdef RPI_INTER_QPU
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--#endif
+--#endif
+--
+--            // Wait for transform completion
+--            vpu_wait(s->vpu_id);
+--
+--            // Copy back reconstructed data
+--            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
+--            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
+--            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
+-+#ifdef RPI_WORKER
+-+            if (s->used_for_ref) {
+-+              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+-+  #ifdef INTER_PASS0
+-+              rpi_execute_inter_cmds(s);
+-+  #endif
+-+  #ifdef LAUNCH_PASS0
+-+              rpi_execute_inter_qpu(s);
+-+  #endif
+-+              // Pass on this job to worker thread
+-+              worker_submit_job(s);
+-+              // Make sure we have space to prepare the next job
+-+              worker_pass0_ready(s);
+- 
+--            // Perform intra prediction and residual reconstruction
+--            rpi_execute_pred_cmds(s);
+--            // Perform deblocking for CTBs in this row
+--            rpi_execute_dblk_cmds(s);
+-+              // Prepare the next batch of commands
+- #ifdef RPI_INTER_QPU
+--            rpi_inter_clear(s);
+-+              rpi_inter_clear(s);
+-+#endif
+-+            } else {
+-+              // Non-ref frame so do it all on this thread
+-+              rpi_do_all_passes(s);
+-+            }
+-+#else
+-+            rpi_do_all_passes(s);
+- #endif
+-           }
+-         }
+- #endif
+-+
+-+
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-             return more_data;
+-@@ -3487,18 +3684,21 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #ifdef RPI
+--    if (s->enable_rpi && s->num_dblk_cmds) {
+--#ifdef RPI_INTER_QPU
+--        rpi_execute_inter_qpu(s);
+--#endif
+--#ifndef RPI_MULTI_MAILBOX
+--        rpi_execute_transform(s);
+-+
+-+#ifdef RPI_WORKER
+-+    // Wait for the worker to finish all its jobs
+-+    if (s->enable_rpi) {
+-+        worker_wait(s);
+-+        av_assert0(s->pass0_job==s->pass1_job);
+-+        av_assert0(s->pass1_job==s->pass2_job);
+-+    }
+- #endif
+--        rpi_execute_inter_cmds(s);
+--        vpu_wait(s->vpu_id);
+--        rpi_execute_pred_cmds(s);
+--        rpi_execute_dblk_cmds(s);
+-+
+-+    // Finish off any half-completed rows
+-+    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
+-+        rpi_do_all_passes(s);
+-     }
+-+
+- #endif
+- 
+-     if (x_ctb + ctb_size >= s->ps.sps->width &&
+-@@ -4230,6 +4430,48 @@ fail:
+-     return AVERROR(ENOMEM);
+- }
+- 
+-+#ifdef RPI_WORKER
+-+static av_cold void hevc_init_worker(HEVCContext *s)
+-+{
+-+    int err;
+-+    pthread_cond_init(&s->worker_cond_head, NULL);
+-+    pthread_cond_init(&s->worker_cond_middle, NULL);
+-+    pthread_cond_init(&s->worker_cond_tail, NULL);
+-+    pthread_mutex_init(&s->worker_mutex, NULL);
+-+
+-+    s->worker_tail=0;
+-+    s->worker_middle=0;
+-+    s->worker_head=0;
+-+    s->kill_worker=0;
+-+    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+-+    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
+-+    if (err) {
+-+        printf("Failed to create worker thread\n");
+-+        exit(-1);
+-+    }
+-+}
+-+
+-+static av_cold void hevc_exit_worker(HEVCContext *s)
+-+{
+-+    void *res;
+-+    s->kill_worker=1;
+-+    pthread_cond_broadcast(&s->worker_cond_tail);
+-+    pthread_cond_broadcast(&s->worker_cond_middle);
+-+    pthread_join(s->worker_thread, &res);
+-+    pthread_join(s->worker_deblock_thread, &res);
+-+
+-+    pthread_cond_destroy(&s->worker_cond_head);
+-+    pthread_cond_destroy(&s->worker_cond_middle);
+-+    pthread_cond_destroy(&s->worker_cond_tail);
+-+    pthread_mutex_destroy(&s->worker_mutex);
+-+
+-+    s->worker_tail=0;
+-+    s->worker_middle=0;
+-+    s->worker_head=0;
+-+    s->kill_worker=0;
+-+}
+-+#endif
+-+
+- static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- {
+-     HEVCContext       *s = avctx->priv_data;
+-@@ -4242,33 +4484,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->cabac_state);
+- 
+- #ifdef RPI
+--    av_freep(&s->unif_mv_cmds);
+--    av_freep(&s->univ_pred_cmds);
+-+
+-+#ifdef RPI_WORKER
+-+    hevc_exit_worker(s);
+-+#endif
+-+
+-+    for(i=0;i<RPI_MAX_JOBS;i++) {
+-+      av_freep(&s->unif_mv_cmds[i]);
+-+      av_freep(&s->univ_pred_cmds[i]);
+- 
+- #ifdef RPI_INTER_QPU
+--    if (s->unif_mvs) {
+--        gpu_free( &s->unif_mvs_ptr );
+--        s->unif_mvs = 0;
+--    }
+-+      if (s->unif_mvs[i]) {
+-+        gpu_free( &s->unif_mvs_ptr[i] );
+-+        s->unif_mvs[i] = 0;
+-+      }
+- #endif
+- #ifdef RPI_LUMA_QPU
+--    if (s->y_unif_mvs) {
+--        gpu_free( &s->y_unif_mvs_ptr );
+--        s->y_unif_mvs = 0;
+--    }
+-+      if (s->y_unif_mvs[i]) {
+-+        gpu_free( &s->y_unif_mvs_ptr[i] );
+-+        s->y_unif_mvs[i] = 0;
+-+      }
+- #endif
+--
+--#ifdef EARLY_MALLOC
+--    printf("hevc_decode_free\n");
+--    if (s->coeffs_buf_arm[0]) {
+--      gpu_free(&s->coeffs_buf_default);
+--      s->coeffs_buf_arm[0] = 0;
+--    }
+--    if (s->coeffs_buf_arm[2]) {
+--      gpu_free(&s->coeffs_buf_accelerated);
+--      s->coeffs_buf_arm[2] = 0;
+-     }
+--#endif
+-+
+- #endif
+- 
+-     for (i = 0; i < 3; i++) {
+-@@ -4328,6 +4566,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-     HEVCContext *s = avctx->priv_data;
+-     int i;
+-+    int job;
+- 
+-     s->avctx = avctx;
+- 
+-@@ -4338,12 +4577,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->sList[0] = s;
+- 
+- #ifdef RPI
+--    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+--    if (!s->unif_mv_cmds)
+--        goto fail;
+--    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+--    if (!s->univ_pred_cmds)
+--        goto fail;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+        if (!s->unif_mv_cmds[job])
+-+            goto fail;
+-+        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-+        if (!s->univ_pred_cmds[job])
+-+            goto fail;
+-+    }
+- 
+- #ifdef RPI_INTER_QPU
+-     // We divide the image into blocks 256 wide and 64 high
+-@@ -4354,18 +4595,20 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     {
+-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-+		for(job=0;job<RPI_MAX_JOBS;job++) {
+- #ifdef RPI_CACHE_UNIF_MVS
+--        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+- #else
+--        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+- #endif
+--        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+- 
+--        // Set up initial locations for uniform streams
+--        p = s->unif_mvs;
+--        for(i = 0; i < 8; i++) {
+--            s->mvs_base[i] = p;
+-+          // Set up initial locations for uniform streams
+-+          p = s->unif_mvs[job];
+-+          for(i = 0; i < 8; i++) {
+-+            s->mvs_base[job][i] = p;
+-             p += uv_commands_per_qpu;
+-+          }
+-         }
+-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-         s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-@@ -4374,61 +4617,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     }
+- #endif
+- #ifdef RPI_LUMA_QPU
+-+    for(job=0;job<RPI_MAX_JOBS;job++)
+-     {
+-         int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-         uint32_t *p;
+- #ifdef RPI_CACHE_UNIF_MVS
+--        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+- #else
+--        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+- #endif
+--        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+- 
+-         // Set up initial locations for uniform streams
+--        p = s->y_unif_mvs;
+-+        p = s->y_unif_mvs[job];
+-         for(i = 0; i < 12; i++) {
+--            s->y_mvs_base[i] = p;
+-+            s->y_mvs_base[job][i] = p;
+-             p += y_commands_per_qpu;
+-         }
+--        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+--        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+--
+-     }
+-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+- #endif
+-     //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+--#ifdef EARLY_MALLOC
+--    {
+--        int coeffs_in_ctb = 64*64;
+--        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+--        s->coeffs_buf_arm[0] = 0;
+--        s->coeffs_buf_arm[2] = 0;
+--        printf("Allocated %d\n",coefs_per_row);
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--        if (!s->coeffs_buf_arm[0])
+--            goto fail;
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--        if (!s->coeffs_buf_arm[2])
+--            goto fail;
+--        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--        printf("Done\n");
+--#ifdef RPI_PRECLEAR
+--        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+--        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+--        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+--#endif
+--    }
+--#endif
+--
+-     s->enable_rpi = 0;
+- 
+-+#ifdef RPI_WORKER
+-+    hevc_init_worker(s);
+-+#endif
+-+
+- #endif
+- 
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 5cb90b5..7bd295a 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -51,6 +51,12 @@
+-     // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-     #define RPI_LUMA_QPU
+-   #endif
+-+
+-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+-+  #define RPI_MAX_JOBS 2
+-+  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-+  #define RPI_WORKER
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -806,6 +812,13 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+-+#ifdef RPI_WORKER
+-+typedef struct HEVCLocalContextIntra {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;
+-+} HEVCLocalContextIntra;
+-+#endif
+-+
+- #ifdef RPI
+- 
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-@@ -874,7 +887,7 @@ typedef struct HEVCPredCmd {
+- 
+- typedef struct HEVCContext {
+- #ifdef RPI
+--    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
+-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+- #endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+-@@ -883,7 +896,9 @@ typedef struct HEVCContext {
+- 
+-     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+-     HEVCLocalContext    *HEVClc;
+--
+-+#ifdef RPI_WORKER
+-+    HEVCLocalContextIntra HEVClcIntra;
+-+#endif
+-     uint8_t             threads_type;
+-     uint8_t             threads_number;
+- 
+-@@ -894,43 +909,60 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;
+--    HEVCPredCmd *univ_pred_cmds;
+-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+-+    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+-     int buf_width;
+--    GPU_MEM_PTR_T coeffs_buf_default;
+--    GPU_MEM_PTR_T coeffs_buf_accelerated;
+--    int16_t *coeffs_buf_arm[4];
+--    unsigned int coeffs_buf_vc[4];
+--    int num_coeffs[4];
+--    int num_xfm_cmds;
+--    int num_mv_cmds;
+--    int num_pred_cmds;
+--    int num_dblk_cmds;
+-+    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+-+    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+-+    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+-+    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+-+    int num_coeffs[RPI_MAX_JOBS][4];
+-+    int num_xfm_cmds[RPI_MAX_JOBS];
+-+    int num_mv_cmds[RPI_MAX_JOBS];
+-+    int num_pred_cmds[RPI_MAX_JOBS];
+-+    int num_dblk_cmds[RPI_MAX_JOBS];
+-     int vpu_id;
+-     //GPU_MEM_PTR_T dummy;
+-+    int pass0_job; // Pass0 does coefficient decode
+-+    int pass1_job; // Pass1 does pixel processing
+-+    int pass2_job; // Pass2 does reconstruction and deblocking
+- #ifdef RPI_INTER_QPU
+--    GPU_MEM_PTR_T unif_mvs_ptr;
+--    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+- 
+-     // _base pointers are to the start of the row
+--    uint32_t *mvs_base[8];
+-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-     // these pointers are to the next free space
+--    uint32_t *u_mvs[8];
+-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-     uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+- #ifdef RPI_LUMA_QPU
+--    GPU_MEM_PTR_T y_unif_mvs_ptr;
+--    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
+--    uint32_t *y_mvs_base[12];
+--    uint32_t *y_mvs[12];
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-     // Function pointers
+-     uint32_t mc_filter;
+-     uint32_t mc_filter_b;
+- #endif
+- 
+-+#ifdef RPI_WORKER
+-+    pthread_t worker_thread;
+-+    pthread_t worker_deblock_thread;
+-+    pthread_cond_t worker_cond_head;
+-+    pthread_cond_t worker_cond_tail;
+-+    pthread_cond_t worker_cond_middle;
+-+    pthread_mutex_t worker_mutex;
+-+
+-+    int worker_tail; // Contains the number of posted jobs
+-+    int worker_head; // Contains the number of completed jobs
+-+    int worker_middle; // Contains the number of completed jobs
+-+    int kill_worker; // set to 1 to terminate the worker
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 38f53de..f0982cd 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1051,11 +1051,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     if (s->enable_rpi) {
+-         int n = trafo_size * trafo_size;
+-         if (use_vpu) {
+--            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
+--            s->num_coeffs[log2_trafo_size - 2] += n;
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-         } else {
+--            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
+--            s->num_coeffs[0] += n;
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-+            s->num_coeffs[s->pass0_job][0] += n;
+-         }
+-     }
+-     // We now do the memset after transform_add while we know the data is cached.
+-@@ -1508,7 +1508,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+-             }
+-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+--            s->hevcdsp.idct_4x4_luma(coeffs);
+-+           s->hevcdsp.idct_4x4_luma(coeffs);
+-         } else {
+- #ifdef RPI
+-             if (!use_vpu) {
+-@@ -1553,7 +1553,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     }
+- #ifdef RPI
+-     if (s->enable_rpi) {
+--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+-         cmd->buf = coeffs;
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 71c6d52..344e021 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -71,8 +71,11 @@ do {                                  \
+-                 AV_WN4P(&ptr[i], a);                                           \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+--
+-+#ifdef RPI_WORKER
+-+    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+#else
+-     HEVCLocalContext *lc = s->HEVClc;
+-+#endif
+-     int i;
+-     int hshift = s->ps.sps->hshift[c_idx];
+-     int vshift = s->ps.sps->vshift[c_idx];
+--- 
+-2.7.4
+-
+-
+-From 1e0885f8d98175777fff65b4cedd708176c2abcf Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 13:43:48 +0100
+-Subject: [PATCH 54/68] Avoid lockup bug with RPI_WORKER enabled
+-
+----
+- libavcodec/hevc.c       | 22 +++++++++++-----------
+- libavcodec/hevc_cabac.c |  1 -
+- 2 files changed, 11 insertions(+), 12 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 12aacc5..182a82f 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -133,11 +133,11 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+- static void worker_submit_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_tail++;
+-   s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-   pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-@@ -145,11 +145,11 @@ static void worker_submit_job(HEVCContext *s)
+- static void worker_complete_middle_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_middle++;
+-   s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-@@ -157,11 +157,11 @@ static void worker_complete_middle_job(HEVCContext *s)
+- static void worker_complete_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_head++; // This is the only place that can change head so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_head++;
+-   s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index f0982cd..6523e66 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1497,7 +1497,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                 for (i = 0; i < 8; i++)
+-                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+-             }
+--
+-             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+- 
+-             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+--- 
+-2.7.4
+-
+-
+-From 1d7ad81069dec6914ec7e9983855d7a1b5e4b123 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 15:37:19 +0100
+-Subject: [PATCH 55/68] Added code to flush buffers at start of frame
+-
+----
+- libavcodec/hevc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 72 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 182a82f..e5b9f1e 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,6 +43,7 @@
+- 
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+-+  #include "rpi_user_vcsm.h"
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- 
+-@@ -3508,6 +3509,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #else
+-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+- #endif
+-+
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-@@ -3558,6 +3560,71 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- }
+- #endif
+- 
+-+#ifdef RPI
+-+
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+-+
+-+static void flush_frame(HEVCContext *s,AVFrame *frame)
+-+{
+-+#if 1
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[2]);
+-+    iocache.s[1].handle = p->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].size  = sz;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+#endif
+-+}
+-+
+-+static void flush_all(HEVCContext *s)
+-+{
+-+#if 0
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 4; // Flush all
+-+    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].size  = 4096;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+  int i,k;
+-+  for(i=0;i<2;i++) {
+-+    for (k = 0; k < s->sh.nb_refs[i]; k++) {
+-+      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
+-+    }
+-+  }
+-+  flush_frame(s,s->frame);
+-+#endif
+-+}
+-+#endif
+-+
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -3592,8 +3659,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         printf("Weighted B slice\n");
+-     }
+- 
+-+    // Now flush all reference frames and our destination frame to get everything ready for decode
+-+    flush_all(s);
+- #endif
+- 
+-+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+-+
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-         return AVERROR_INVALIDDATA;
+-@@ -3664,6 +3735,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             rpi_do_all_passes(s);
+- #endif
+-           }
+-+
+-         }
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From 7a57f233dcd4048e20a0b5bc06bc20abb589d3fa Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 16:42:24 +0100
+-Subject: [PATCH 56/68] Reduce the amount that needs to be flushed
+-
+----
+- libavcodec/hevc.c | 35 +++++++++++------------------------
+- 1 file changed, 11 insertions(+), 24 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index e5b9f1e..73d7f74 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3569,7 +3569,7 @@ static void flush_buffer(AVBufferRef *bref) {
+- 
+- static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+--#if 1
+-+#ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-@@ -3603,26 +3603,6 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+--static void flush_all(HEVCContext *s)
+--{
+--#if 0
+--    struct vcsm_user_clean_invalid_s iocache = {};
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
+--    iocache.s[0].handle = p->vcsm_handle;
+--    iocache.s[0].cmd = 4; // Flush all
+--    iocache.s[0].addr = p->arm;
+--    iocache.s[0].size  = 4096;
+--    vcsm_clean_invalid( &iocache );
+--#else
+--  int i,k;
+--  for(i=0;i<2;i++) {
+--    for (k = 0; k < s->sh.nb_refs[i]; k++) {
+--      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
+--    }
+--  }
+--  flush_frame(s,s->frame);
+--#endif
+--}
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -3658,9 +3638,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+-     }
+--
+--    // Now flush all reference frames and our destination frame to get everything ready for decode
+--    flush_all(s);
+- #endif
+- 
+-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+-@@ -4130,6 +4107,11 @@ static int hevc_frame_start(HEVCContext *s)
+-     if (!s->avctx->hwaccel)
+-         ff_thread_finish_setup(s->avctx);
+- 
+-+#ifdef RPI_INTER_QPU
+-+    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
+-+    flush_frame(s,s->frame);
+-+#endif
+-+
+-     return 0;
+- 
+- fail:
+-@@ -4331,6 +4313,11 @@ fail:
+-         ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-+    } else if (s->ref) {
+-+#ifdef RPI_INTER_QPU
+-+      // When running single threaded we need to flush the whole frame
+-+      flush_frame(s,s->frame);
+-+#endif
+-     }
+-     return ret;
+- }
+--- 
+-2.7.4
+-
+-
+-From 26eba8e3266cc5f2120e8284a1ce486d6a402010 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 4 Jun 2015 07:59:28 +0100
+-Subject: [PATCH 57/68] Corrected support for disabled rpi when using
+- RPI_WORKER
+-
+----
+- libavcodec/hevc.h              | 18 ++++++++++--------
+- libavcodec/hevcpred_template.c |  2 +-
+- 2 files changed, 11 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7bd295a..3cb34bd 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -769,7 +769,17 @@ typedef struct HEVCFrame {
+-     uint8_t flags;
+- } HEVCFrame;
+- 
+-+#ifdef RPI_WORKER
+-+typedef struct HEVCLocalContextIntra {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;
+-+} HEVCLocalContextIntra;
+-+#endif
+-+
+- typedef struct HEVCLocalContext {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+-+
+-     uint8_t cabac_state[HEVC_CONTEXTS];
+- 
+-     uint8_t stat_coeff[4];
+-@@ -784,7 +794,6 @@ typedef struct HEVCLocalContext {
+- 
+-     int qPy_pred;
+- 
+--    TransformUnit tu;
+- 
+-     uint8_t ctb_left_flag;
+-     uint8_t ctb_up_flag;
+-@@ -801,7 +810,6 @@ typedef struct HEVCLocalContext {
+-     int ct_depth;
+-     CodingUnit cu;
+-     PredictionUnit pu;
+--    NeighbourAvailable na;
+- 
+- #define BOUNDARY_LEFT_SLICE     (1 << 0)
+- #define BOUNDARY_LEFT_TILE      (1 << 1)
+-@@ -812,12 +820,6 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+--#ifdef RPI_WORKER
+--typedef struct HEVCLocalContextIntra {
+--    TransformUnit tu;
+--    NeighbourAvailable na;
+--} HEVCLocalContextIntra;
+--#endif
+- 
+- #ifdef RPI
+- 
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 344e021..325b60e 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -72,7 +72,7 @@ do {                                  \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+- #ifdef RPI_WORKER
+--    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+- #else
+-     HEVCLocalContext *lc = s->HEVClc;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 5b3eee9be88a5326df7621de95095def969e05a8 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 4 Jun 2015 11:52:55 +0100
+-Subject: [PATCH 58/68] Draft support for tiles
+-
+----
+- libavcodec/hevc.c              | 140 +++++++++++++++++++++++------------------
+- libavcodec/hevc.h              |  21 +++++--
+- libavcodec/hevc_filter.c       |   2 +-
+- libavcodec/hevcpred_template.c |   2 +-
+- 4 files changed, 99 insertions(+), 66 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 73d7f74..ec67252 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -63,10 +63,10 @@
+- 
+-   static void rpi_execute_dblk_cmds(HEVCContext *s);
+-   static void rpi_execute_transform(HEVCContext *s);
+--  static void rpi_execute_inter_qpu(HEVCContext *s);
+-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
+-   static void rpi_execute_pred_cmds(HEVCContext *s);
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+--  static void rpi_inter_clear(HEVCContext *s);
+-+  static void rpi_begin(HEVCContext *s);
+- 
+-   // Define INTER_PASS0 to do inter prediction in first pass
+-   //#define INTER_PASS0
+-@@ -90,16 +90,18 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+-+// For each block of 64*64 the smallest block size is 8x4
+-+// We also need an extra command for the setup information
+-+
+- #define RPI_CHROMA_COMMAND_WORDS 12
+--#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+--// Split image of 2048 into parts 64 wide
+--// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+--// For each block of 64*64 the smallest block size is 8x4
+- #define RPI_LUMA_COMMAND_WORDS 9
+--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -216,7 +218,7 @@ static void *worker_start(void *arg)
+-     LOG_ENTER
+-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+- #ifndef LAUNCH_PASS0
+--    rpi_execute_inter_qpu(s);
+-+    rpi_launch_vpu_qpu(s);
+- #endif
+- #ifndef INTER_PASS0
+-     // Perform inter prediction
+-@@ -322,9 +324,14 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- 
+- #ifdef RPI
+-     av_assert0(sps);
+--    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+--    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+-     int job;
+-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
+-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+-     for(job=0;job<RPI_MAX_JOBS;job++) {
+-       printf("Allocated %d\n",coefs_per_row);
+-       for(job=0;job<RPI_MAX_JOBS;job++) {
+-@@ -2186,10 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-             int x1 = x0 + (mv->x >> 2);
+-             int y1 = y0 + (mv->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2209,7 +2215,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+-         {
+-@@ -2233,12 +2239,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+--                int chan = x0>>8;
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2262,7 +2266,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -2289,10 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-             int x1 = x0 + (mv->x >> 2);
+-             int y1 = y0 + (mv->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2312,7 +2315,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+- 
+-@@ -2337,12 +2340,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+--                int chan = x0>>8;
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2367,7 +2368,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -2400,8 +2401,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int y1 = y0 + (mv->y >> 2);
+-             int x2 = x0 + (mv2->x >> 2);
+-             int y2 = y0 + (mv2->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-                   int bw = nPbW-start_x;
+-@@ -2417,7 +2417,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+-         {
+-@@ -2448,9 +2448,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-                 int y2_c = y0_c + (mv2->y >> (2 + hshift));
+- 
+--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2479,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -3114,12 +3113,8 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- 
+- static void rpi_do_all_passes(HEVCContext *s)
+- {
+--#ifdef RPI_INTER_QPU
+--    // Kick off inter prediction on QPUs
+--    rpi_execute_inter_qpu(s);
+--#else
+--    rpi_execute_transform(s);
+--#endif
+-+    // Kick off QPUs and VPUs
+-+    rpi_launch_vpu_qpu(s);
+-     // Perform luma inter prediction
+-     rpi_execute_inter_cmds(s);
+-     // Wait for transform completion
+-@@ -3128,18 +3123,18 @@ static void rpi_do_all_passes(HEVCContext *s)
+-     rpi_execute_pred_cmds(s);
+-     // Perform deblocking for CTBs in this row
+-     rpi_execute_dblk_cmds(s);
+--#ifdef RPI_INTER_QPU
+--    rpi_inter_clear(s);
+--#endif
+-+    // Prepare next batch
+-+    rpi_begin(s);
+- }
+- 
+- #endif
+- 
+--#ifdef RPI_INTER_QPU
+--static void rpi_inter_clear(HEVCContext *s)
+-+#ifdef RPI
+-+static void rpi_begin(HEVCContext *s)
+- {
+-     int job = s->pass0_job;
+-     int i;
+-+#ifdef RPI_INTER_QPU
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-     int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-@@ -3165,6 +3160,8 @@ static void rpi_inter_clear(HEVCContext *s)
+-         }
+-         *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-     }
+-+    s->curr_u_mvs = s->u_mvs[job][0];
+-+#endif
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(i=0;i<12;i++) {
+-@@ -3187,8 +3184,11 @@ static void rpi_inter_clear(HEVCContext *s)
+-         }
+-         *s->y_mvs[job][i]++ = 0; // Next kernel
+-     }
+-+    s->curr_y_mvs = s->y_mvs[job][0];
+- #endif
+-+    s->ctu_count = 0;
+- }
+-+#endif
+- 
+- #ifdef RPI_SIMULATE_QPUS
+- 
+-@@ -3459,8 +3459,9 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- 
+- #endif
+- 
+-+#ifdef RPI_INTER_QPU
+- 
+--static void rpi_execute_inter_qpu(HEVCContext *s)
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+- {
+-     int k;
+- #ifdef LAUNCH_PASS0
+-@@ -3558,6 +3559,15 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- 
+- }
+-+#else
+-+
+-+#ifdef RPI
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+  rpi_execute_transform(s);
+-+}
+-+#endif
+-+
+- #endif
+- 
+- #ifdef RPI
+-@@ -3617,29 +3627,20 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+- #ifdef RPI_INTER_QPU
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+--                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-                     && !s->ps.pps->cross_component_prediction_enabled_flag
+--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+- #else
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+--                    && s->ps.sps->width <= RPI_MAX_WIDTH
+--                    && !s->ps.pps->cross_component_prediction_enabled_flag
+--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+- #endif
+- 
+-     if (!s->enable_rpi) {
+-       if (s->ps.pps->cross_component_prediction_enabled_flag)
+-         printf("Cross component\n");
+--      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+--        printf("Tiles\n");
+--      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+--        printf("Weighted P slice\n");
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+-     }
+- #endif
+--
+-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-@@ -3660,8 +3661,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     s->pass1_job = 0;
+-     s->pass2_job = 0;
+- #endif
+--#ifdef RPI_INTER_QPU
+--    rpi_inter_clear(s);
+-+#ifdef RPI
+-+    rpi_begin(s);
+- #endif
+- 
+-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+-@@ -3679,13 +3680,34 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-+#ifdef RPI_INTER_QPU
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
+-+#endif
+-+
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- 
+-+#ifdef RPI_INTER_QPU
+-+        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
+-+#endif
+-+
+- #ifdef RPI
+-         if (s->enable_rpi) {
+-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+-+          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+-+          //av_assert0(s->pass0_job>=0);
+-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+--          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-+          s->ctu_count++;
+-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+-+
+-+          if ( s->ctu_count >= s->max_ctu_count ) {
+- #ifdef RPI_WORKER
+-             if (s->used_for_ref) {
+-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
+-@@ -3693,7 +3715,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-               rpi_execute_inter_cmds(s);
+-   #endif
+-   #ifdef LAUNCH_PASS0
+--              rpi_execute_inter_qpu(s);
+-+              rpi_launch_vpu_qpu(s);
+-   #endif
+-               // Pass on this job to worker thread
+-               worker_submit_job(s);
+-@@ -3701,9 +3723,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-               worker_pass0_ready(s);
+- 
+-               // Prepare the next batch of commands
+--#ifdef RPI_INTER_QPU
+--              rpi_inter_clear(s);
+--#endif
+-+              rpi_begin(s);
+-             } else {
+-               // Non-ref frame so do it all on this thread
+-               rpi_do_all_passes(s);
+-@@ -3744,7 +3764,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #endif
+- 
+-     // Finish off any half-completed rows
+--    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
+-+    if (s->enable_rpi && s->ctu_count) {
+-         rpi_do_all_passes(s);
+-     }
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 3cb34bd..a141316 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -823,8 +823,15 @@ typedef struct HEVCLocalContext {
+- 
+- #ifdef RPI
+- 
+-+// The processing is done in chunks
+-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
+-+// This is a distance of 1536 pixels across the screen
+-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+-+// but allocate more memory and increase the latency before data in the next frame can be processed
+-+#define RPI_NUM_CHUNKS 1
+-+
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+--#define RPI_MAX_WIDTH 2048
+-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+- 
+- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+-@@ -888,9 +895,6 @@ typedef struct HEVCPredCmd {
+- #endif
+- 
+- typedef struct HEVCContext {
+--#ifdef RPI
+--    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+--#endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+- 
+-@@ -928,6 +932,10 @@ typedef struct HEVCContext {
+-     int pass0_job; // Pass0 does coefficient decode
+-     int pass1_job; // Pass1 does pixel processing
+-     int pass2_job; // Pass2 does reconstruction and deblocking
+-+    int ctu_count; // Number of CTUs done in pass0 so far
+-+    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+-+    int ctu_per_y_chan; // Number of CTUs per luma QPU
+-+    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+- #ifdef RPI_INTER_QPU
+-     GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-     uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-@@ -936,6 +944,7 @@ typedef struct HEVCContext {
+-     uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-     // these pointers are to the next free space
+-     uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-     uint32_t mc_filter_uv_b0;
+-@@ -946,6 +955,7 @@ typedef struct HEVCContext {
+-     uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-     uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-     uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
+-     // Function pointers
+-     uint32_t mc_filter;
+-     uint32_t mc_filter_b;
+-@@ -1084,6 +1094,9 @@ typedef struct HEVCContext {
+-     uint32_t max_mastering_luminance;
+-     uint32_t min_mastering_luminance;
+- 
+-+#ifdef RPI
+-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+-+#endif
+- } HEVCContext;
+- 
+- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index b286bbf..1f04790 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -891,7 +891,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+-         if (curr_uv < 0) curr_uv = 0;
+--        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
+-+        if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+-         GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 325b60e..28d2653 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -72,7 +72,7 @@ do {                                  \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+- #ifdef RPI_WORKER
+--    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+-+    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+- #else
+-     HEVCLocalContext *lc = s->HEVClc;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 1674a80d147e5342ef6ea9a4fb4ddfc640c15a05 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 4 Jun 2015 15:48:10 +0100
+-Subject: [PATCH 59/68] Move deblocker into second pass
+-
+----
+- libavcodec/hevc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
+- 1 file changed, 65 insertions(+), 14 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ec67252..6cecbdd 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -67,6 +67,8 @@
+-   static void rpi_execute_pred_cmds(HEVCContext *s);
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+-   static void rpi_begin(HEVCContext *s);
+-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+-   // Define INTER_PASS0 to do inter prediction in first pass
+-   //#define INTER_PASS0
+-@@ -227,6 +229,11 @@ static void *worker_start(void *arg)
+-     // Wait for transform completion
+-     vpu_wait(s->vpu_id);
+- 
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+
+-     worker_complete_middle_job(s);
+-     LOG_EXIT
+-   }
+-@@ -248,10 +255,6 @@ static void *worker_deblock_start(void *arg)
+-       break;
+-     }
+-     LOG_ENTER
+--    // Perform intra prediction and residual reconstruction
+--    rpi_execute_pred_cmds(s);
+--    // Perform deblocking for CTBs in this row
+--    rpi_execute_dblk_cmds(s);
+- 
+-     worker_complete_job(s);
+-     LOG_EXIT
+-@@ -2983,7 +2986,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- static void rpi_execute_dblk_cmds(HEVCContext *s)
+- {
+-     int n;
+--    int job = s->pass2_job;
+-+    int job = s->pass1_job;
+-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+-     int (*p)[2] = s->dblk_cmds[job];
+-     for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+-@@ -3021,7 +3024,7 @@ static void rpi_execute_transform(HEVCContext *s)
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+--  int job = s->pass2_job;
+-+  int job = s->pass1_job;
+-   HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+- #ifdef RPI_WORKER
+-   HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-@@ -3506,11 +3509,10 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+- #else
+--    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
+- #endif
+--
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-@@ -3613,6 +3615,60 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[2]);
+-+    iocache.s[1].handle = p->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].size  = sz;
+-+
+-+    iocache.s[3].handle = p0->vcsm_handle;
+-+    iocache.s[3].cmd = 3; // clean+invalidate
+-+    iocache.s[3].addr = (int) p0->arm;
+-+    iocache.s[3].size  = p0->numbytes;
+-+    if (p1) {
+-+      iocache.s[4].handle = p1->vcsm_handle;
+-+      iocache.s[4].cmd = 3; // clean+invalidate
+-+      iocache.s[4].addr = (int) p1->arm;
+-+      iocache.s[4].size  = p1->numbytes;
+-+    }
+-+    if (p2) {
+-+      iocache.s[5].handle = p2->vcsm_handle;
+-+      iocache.s[5].cmd = 3; // clean+invalidate
+-+      iocache.s[5].addr = (int) p2->arm;
+-+      iocache.s[5].size  = p2->numbytes;
+-+    }
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+    gpu_cache_flush3(p0, p1, p2);
+-+#endif
+-+}
+-+
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -4127,11 +4183,6 @@ static int hevc_frame_start(HEVCContext *s)
+-     if (!s->avctx->hwaccel)
+-         ff_thread_finish_setup(s->avctx);
+- 
+--#ifdef RPI_INTER_QPU
+--    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
+--    flush_frame(s,s->frame);
+--#endif
+--
+-     return 0;
+- 
+- fail:
+--- 
+-2.7.4
+-
+-
+-From a453fe438c4ab311d6476955d0a40a5d2ed8a1c6 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 4 Jun 2015 16:10:23 +0100
+-Subject: [PATCH 60/68] Change order of ctu accesses to improve qpu performance
+-
+----
+- libavcodec/hevc.c | 8 ++++----
+- 1 file changed, 4 insertions(+), 4 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 6cecbdd..ec17e64 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3737,19 +3737,19 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+- #ifdef RPI_INTER_QPU
+--        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+- #endif
+- #ifdef RPI_LUMA_QPU
+--        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+- #endif
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- 
+- #ifdef RPI_INTER_QPU
+--        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
+-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+- #endif
+- #ifdef RPI_LUMA_QPU
+--        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
+-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+- #endif
+- 
+- #ifdef RPI
+--- 
+-2.7.4
+-
+-
+-From 504de0435e8f660c1b7b2d6ec053dc922a2d2896 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 09:36:59 +0100
+-Subject: [PATCH 61/68] Removed deblocker thread
+-
+----
+- libavcodec/hevc.c | 77 +++----------------------------------------------------
+- libavcodec/hevc.h |  4 ---
+- 2 files changed, 4 insertions(+), 77 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ec17e64..1868532 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -70,11 +70,6 @@
+-   static void flush_frame(HEVCContext *s,AVFrame *frame);
+-   static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+--  // Define INTER_PASS0 to do inter prediction in first pass
+--  //#define INTER_PASS0
+--  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
+--  //#define LAUNCH_PASS0
+--
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -147,24 +142,12 @@ static void worker_submit_job(HEVCContext *s)
+- }
+- 
+- // Call this to say we have completed pass1
+--static void worker_complete_middle_job(HEVCContext *s)
+--{
+--  LOG_ENTER
+--  pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_middle++;
+--  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
+--  pthread_mutex_unlock(&s->worker_mutex);
+--  LOG_EXIT
+--}
+--
+--// Call this to say we have completed pass2
+- static void worker_complete_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+-   pthread_mutex_lock(&s->worker_mutex);
+-   s->worker_head++;
+--  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-   pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+-   pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+-@@ -208,7 +191,7 @@ static void *worker_start(void *arg)
+-   while(1) {
+-     pthread_mutex_lock(&s->worker_mutex);
+- 
+--    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
+-+    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+-     {
+-       pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+-     }
+-@@ -219,13 +202,9 @@ static void *worker_start(void *arg)
+-     }
+-     LOG_ENTER
+-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--#ifndef LAUNCH_PASS0
+-     rpi_launch_vpu_qpu(s);
+--#endif
+--#ifndef INTER_PASS0
+-     // Perform inter prediction
+-     rpi_execute_inter_cmds(s);
+--#endif
+-     // Wait for transform completion
+-     vpu_wait(s->vpu_id);
+- 
+-@@ -234,28 +213,6 @@ static void *worker_start(void *arg)
+-     // Perform deblocking for CTBs in this row
+-     rpi_execute_dblk_cmds(s);
+- 
+--    worker_complete_middle_job(s);
+--    LOG_EXIT
+--  }
+--  return NULL;
+--}
+--
+--static void *worker_deblock_start(void *arg)
+--{
+--  HEVCContext *s = (HEVCContext *)arg;
+--  while(1) {
+--    pthread_mutex_lock(&s->worker_mutex);
+--    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
+--    {
+--      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
+--    }
+--    pthread_mutex_unlock(&s->worker_mutex);
+--
+--    if (s->kill_worker) {
+--      break;
+--    }
+--    LOG_ENTER
+--
+-     worker_complete_job(s);
+-     LOG_EXIT
+-   }
+-@@ -2998,11 +2955,7 @@ static void rpi_execute_dblk_cmds(HEVCContext *s)
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+--#ifdef LAUNCH_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     //int j;
+-     //int16_t *coeffs = s->coeffs_buf_arm[i];
+-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-@@ -3057,11 +3010,7 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+- 
+- static void rpi_execute_inter_cmds(HEVCContext *s)
+- {
+--#ifdef INTER_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-     int n,cidx;
+-     AVFrame myref;
+-@@ -3467,11 +3416,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- static void rpi_launch_vpu_qpu(HEVCContext *s)
+- {
+-     int k;
+--#ifdef LAUNCH_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+- #ifdef RPI_LUMA_QPU
+-@@ -3574,10 +3519,12 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI
+- 
+-+#ifndef RPI_FAST_CACHEFLUSH
+- static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+- }
+-+#endif
+- 
+- static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+-@@ -3715,7 +3662,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_WORKER
+-     s->pass0_job = 0;
+-     s->pass1_job = 0;
+--    s->pass2_job = 0;
+- #endif
+- #ifdef RPI
+-     rpi_begin(s);
+-@@ -3767,12 +3713,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_WORKER
+-             if (s->used_for_ref) {
+-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
+--  #ifdef INTER_PASS0
+--              rpi_execute_inter_cmds(s);
+--  #endif
+--  #ifdef LAUNCH_PASS0
+--              rpi_launch_vpu_qpu(s);
+--  #endif
+-               // Pass on this job to worker thread
+-               worker_submit_job(s);
+-               // Make sure we have space to prepare the next job
+-@@ -3814,8 +3754,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     // Wait for the worker to finish all its jobs
+-     if (s->enable_rpi) {
+-         worker_wait(s);
+--        av_assert0(s->pass0_job==s->pass1_job);
+--        av_assert0(s->pass1_job==s->pass2_job);
+-     }
+- #endif
+- 
+-@@ -4565,16 +4503,13 @@ static av_cold void hevc_init_worker(HEVCContext *s)
+- {
+-     int err;
+-     pthread_cond_init(&s->worker_cond_head, NULL);
+--    pthread_cond_init(&s->worker_cond_middle, NULL);
+-     pthread_cond_init(&s->worker_cond_tail, NULL);
+-     pthread_mutex_init(&s->worker_mutex, NULL);
+- 
+-     s->worker_tail=0;
+--    s->worker_middle=0;
+-     s->worker_head=0;
+-     s->kill_worker=0;
+-     err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+--    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
+-     if (err) {
+-         printf("Failed to create worker thread\n");
+-         exit(-1);
+-@@ -4586,17 +4521,13 @@ static av_cold void hevc_exit_worker(HEVCContext *s)
+-     void *res;
+-     s->kill_worker=1;
+-     pthread_cond_broadcast(&s->worker_cond_tail);
+--    pthread_cond_broadcast(&s->worker_cond_middle);
+-     pthread_join(s->worker_thread, &res);
+--    pthread_join(s->worker_deblock_thread, &res);
+- 
+-     pthread_cond_destroy(&s->worker_cond_head);
+--    pthread_cond_destroy(&s->worker_cond_middle);
+-     pthread_cond_destroy(&s->worker_cond_tail);
+-     pthread_mutex_destroy(&s->worker_mutex);
+- 
+-     s->worker_tail=0;
+--    s->worker_middle=0;
+-     s->worker_head=0;
+-     s->kill_worker=0;
+- }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index a141316..ef5bfb1 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -931,7 +931,6 @@ typedef struct HEVCContext {
+-     //GPU_MEM_PTR_T dummy;
+-     int pass0_job; // Pass0 does coefficient decode
+-     int pass1_job; // Pass1 does pixel processing
+--    int pass2_job; // Pass2 does reconstruction and deblocking
+-     int ctu_count; // Number of CTUs done in pass0 so far
+-     int max_ctu_count; // Number of CTUs when we trigger a round of processing
+-     int ctu_per_y_chan; // Number of CTUs per luma QPU
+-@@ -963,15 +962,12 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI_WORKER
+-     pthread_t worker_thread;
+--    pthread_t worker_deblock_thread;
+-     pthread_cond_t worker_cond_head;
+-     pthread_cond_t worker_cond_tail;
+--    pthread_cond_t worker_cond_middle;
+-     pthread_mutex_t worker_mutex;
+- 
+-     int worker_tail; // Contains the number of posted jobs
+-     int worker_head; // Contains the number of completed jobs
+--    int worker_middle; // Contains the number of completed jobs
+-     int kill_worker; // set to 1 to terminate the worker
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From 74892301cdb0829de959b798debac6ffe1c71603 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 11:04:43 +0100
+-Subject: [PATCH 62/68] Reduced amount of output frame that is invalidated
+-
+----
+- libavcodec/hevc.c | 45 +++++++++++++++++++++++++++++----------------
+- 1 file changed, 29 insertions(+), 16 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 1868532..cbb4f46 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -68,7 +68,7 @@
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+-   static void rpi_begin(HEVCContext *s);
+-   static void flush_frame(HEVCContext *s,AVFrame *frame);
+--  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+- 
+- #endif
+- 
+-@@ -3454,9 +3454,9 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+- #else
+--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+- #endif
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-@@ -3530,6 +3530,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-     int curr_uv = 0;
+-@@ -3537,22 +3538,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+-     int sz,base;
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].addr = (int)(p->arm) + base;
+-     iocache.s[0].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[2]);
+-     iocache.s[1].handle = p->vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].addr = (int)(p->arm) + base;
+-     iocache.s[1].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[0]);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+-     iocache.s[2].handle = p->vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].addr = (int)(p->arm) + base;
+-     iocache.s[2].size  = sz;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -3562,33 +3562,46 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+--static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+--    int n = s->ps.sps->height;
+--    int curr_y = 0;
+--    int curr_uv = 0;
+--    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int n;
+-+    int curr_y;
+-+    int curr_uv;
+-+    int n_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     int sz,base;
+-+    int (*d)[2] = s->dblk_cmds[job];
+-+    int low=(*d)[1];
+-+    int high=(*d)[1];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+-+        int y = (*d)[1];
+-+        low=FFMIN(low,y);
+-+        high=FFMAX(high,y);
+-+    }
+-+    curr_y = low;
+-+    n = high+(1 << s->ps.sps->log2_ctb_size);
+-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+    n_uv = n >> s->ps.sps->vshift[1];
+-+
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].addr = (int)(p->arm) + base;
+-     iocache.s[0].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[2]);
+-     iocache.s[1].handle = p->vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].addr = (int)(p->arm) + base;
+-     iocache.s[1].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[0]);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+-     iocache.s[2].handle = p->vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].addr = (int)(p->arm) + base;
+-     iocache.s[2].size  = sz;
+- 
+-     iocache.s[3].handle = p0->vcsm_handle;
+--- 
+-2.7.4
+-
+-
+-From 090b6be5b501bd3c547700926e540397f0b39e69 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 11:55:29 +0100
+-Subject: [PATCH 63/68] Packed 16x16 and 32x32 into the same buffer
+-
+----
+- libavcodec/hevc.c       | 24 +++++++++++++++---------
+- libavcodec/hevc_cabac.c |  9 ++++++++-
+- libavcodec/rpi_qpu.c    |  2 +-
+- 3 files changed, 24 insertions(+), 11 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index cbb4f46..a596534 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -299,12 +299,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-         s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-         if (!s->coeffs_buf_arm[job][0])
+-             goto fail;
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
+-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+-         s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-         s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-         if (!s->coeffs_buf_arm[job][2])
+-             goto fail;
+--        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+-         s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-       }
+-     }
+-@@ -2956,15 +2956,20 @@ static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-     int job = s->pass1_job;
+--    //int j;
+--    //int16_t *coeffs = s->coeffs_buf_arm[i];
+--    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+--    //    s->hevcdsp.idct[4-2](coeffs, 16);
+--    //}
+-+    /*int j;
+-+    int16_t *coeffs = s->coeffs_buf_arm[job][i];
+-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-+        s->hevcdsp.idct[4-2](coeffs, 16);
+-+    }
+-+    i=3;
+-+    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+-+        s->hevcdsp.idct[5-2](coeffs, 32);
+-+    }*/
+- 
+-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+--                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
+-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+-                                s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-@@ -3458,7 +3463,8 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- #else
+-     flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+- #endif
+--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 6523e66..8656917 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1051,7 +1051,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     if (s->enable_rpi) {
+-         int n = trafo_size * trafo_size;
+-         if (use_vpu) {
+--            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            // We support size 4 and size 5.
+-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+-+            // num_coeffs is indexed by log2_trafo_size-2
+-+            if (log2_trafo_size == 4)
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            else
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+-             s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-         } else {
+-             coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4480f72..0121fca 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -5,7 +5,7 @@
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+- // define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+--//#define RPI_TIME_TOTAL_POSTED
+-+#define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- 
+--- 
+-2.7.4
+-
+-
+-From ed359bbce56817bf9db0e54701103bd0505c353b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 25 Jun 2015 09:02:47 +0100
+-Subject: [PATCH 64/68] Moved luma deblock to VPU
+-
+----
+- libavcodec/hevc.c               |   18 +-
+- libavcodec/hevc.h               |   11 +
+- libavcodec/hevc_filter.c        |  120 ++-
+- libavcodec/rpi_hevc_transform.h | 1802 ++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s |  426 +++++++++
+- libavcodec/rpi_qpu.c            |   12 +-
+- libavcodec/rpi_shader.c         |    2 +-
+- 7 files changed, 2378 insertions(+), 13 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index a596534..4ce94a7 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -246,6 +246,12 @@ static void pic_arrays_free(HEVCContext *s)
+-       }
+-     }
+- #endif
+-+#ifdef RPI_DEBLOCK_VPU
+-+    if (s->y_setup_arm) {
+-+      gpu_free(&s->y_setup_ptr);
+-+      s->y_setup_arm = 0;
+-+    }
+-+#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -283,12 +289,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+- #ifdef RPI
+--    av_assert0(sps);
+-     int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-     int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-     int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+-     int job;
+-+    av_assert0(sps);
+-     s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-     s->ctu_per_y_chan = s->max_ctu_count / 12;
+-     s->ctu_per_uv_chan = s->max_ctu_count / 8;
+-@@ -309,6 +315,16 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-       }
+-     }
+- #endif
+-+#ifdef RPI_DEBLOCK_VPU
+-+    s->enable_rpi_deblock = !sps->sao_enabled;
+-+    s->setup_width = (sps->width+15) / 16;
+-+    s->setup_height = (sps->height+15) / 16;
+-+    gpu_malloc_uncached(sizeof(*s->y_setup_arm) * s->setup_width * s->setup_height, &s->y_setup_ptr); // TODO make this cached
+-+    s->y_setup_arm = (void*)s->y_setup_ptr.arm;
+-+    s->y_setup_vc = (void*)s->y_setup_ptr.vc;
+-+    memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
+-+    printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
+-+#endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-     s->bs_height = (height >> 2) + 1;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index ef5bfb1..cf08489 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -57,6 +57,8 @@
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+- 
+-+  #define RPI_DEBLOCK_VPU
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -971,6 +973,15 @@ typedef struct HEVCContext {
+-     int kill_worker; // set to 1 to terminate the worker
+- #endif
+- 
+-+#ifdef RPI_DEBLOCK_VPU
+-+    int enable_rpi_deblock;
+-+    GPU_MEM_PTR_T y_setup_ptr;
+-+    uint8_t (*y_setup_arm)[2][2][2][4];
+-+    uint8_t (*y_setup_vc)[2][2][2][4];
+-+    int setup_width; // Number of 16x16 blocks across the image
+-+    int setup_height; // Number of 16x16 blocks down the image
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f04790..06371da 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -564,6 +564,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((y>>3) & 1) << 1;
+-+                    int b = (x>>3) & 1;
+-+                    setup = s->y_setup_arm[num16];
+-+                    setup[0][b][0][a] = beta;
+-+                    setup[0][b][0][a + 1] = beta;
+-+                    setup[0][b][1][a] = tc[0];
+-+                    setup[0][b][1][a + 1] = tc[1];
+-+                } else
+-+#endif
+-                     s->hevcdsp.hevc_v_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -596,6 +609,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((x>>3) & 1) << 1;
+-+                    int b = (y>>3) & 1;
+-+                    setup = s->y_setup_arm[num16];
+-+                    setup[1][b][0][a] = beta;
+-+                    setup[1][b][0][a + 1] = beta;
+-+                    setup[1][b][1][a] = tc[0];
+-+                    setup[1][b][1][a + 1] = tc[1];
+-+                } else
+-+#endif
+-                     s->hevcdsp.hevc_h_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -876,33 +902,85 @@ static void flush_buffer(AVBufferRef *bref) {
+- }
+- 
+- // Return Physical address for this image
+--static int ff_hevc_buf_base(AVBufferRef *bref) {
+-+static uint32_t get_vc_address(AVBufferRef *bref) {
+-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc & 0x3fffffff;
+-+  return p->vc;
+- }
+- 
+-+// ff_hevc_flush_buffer_lines
+-+// flushes and invalidates all pixel rows in [start,end-1]
+-+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = start;
+-+        int n = end;
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T *p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        if (flush_chroma) {
+-+          p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+          iocache.s[0].handle = p->vcsm_handle;
+-+          iocache.s[0].cmd = 3; // clean+invalidate
+-+          iocache.s[0].addr = (int)p->arm + base;
+-+          iocache.s[0].size  = sz;
+-+          p = av_buffer_pool_opaque(s->frame->buf[2]);
+-+          iocache.s[1].handle = p->vcsm_handle;
+-+          iocache.s[1].cmd = 3; // clean+invalidate
+-+          iocache.s[1].addr = (int)p->arm + base;
+-+          iocache.s[1].size  = sz;
+-+        }
+-+        if (flush_luma) {
+-+          p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+          sz = s->frame->linesize[0] * (n-curr_y);
+-+          base = s->frame->linesize[0] * curr_y;
+-+          iocache.s[2].handle = p->vcsm_handle;
+-+          iocache.s[2].cmd = 3; // clean+invalidate
+-+          iocache.s[2].addr = (int)p->arm + base;
+-+          iocache.s[2].size  = sz;
+-+        }
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        if (flush_chroma) {
+-+          flush_buffer(s->frame->buf[1]);
+-+          flush_buffer(s->frame->buf[2]);
+-+        }
+-+        if (flush_luma) {
+-+          flush_buffer(s->frame->buf[0]);
+-+        }
+-+#endif
+-+}
+-+
+-+
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && s->used_for_ref) {
+-+      // TODO make this use ff_hevc_flush_buffer_lines
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+-         int curr_y = ((int *)f->progress->data)[0];
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+-+        GPU_MEM_PTR_T *p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+--        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+        p = av_buffer_pool_opaque(s->frame->buf[1]);
+-         iocache.s[0].handle = p->vcsm_handle;
+-         iocache.s[0].cmd = 3; // clean+invalidate
+--        iocache.s[0].addr = p->arm + base;
+-+        iocache.s[0].addr = (int)p->arm + base;
+-         iocache.s[0].size  = sz;
+-         p = av_buffer_pool_opaque(s->frame->buf[2]);
+-         iocache.s[1].handle = p->vcsm_handle;
+-         iocache.s[1].cmd = 3; // clean+invalidate
+--        iocache.s[1].addr = p->arm + base;
+-+        iocache.s[1].addr = (int)p->arm + base;
+-         iocache.s[1].size  = sz;
+- 
+- #ifdef RPI_LUMA_QPU
+-@@ -911,7 +989,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         base = s->frame->linesize[0] * curr_y;
+-         iocache.s[2].handle = p->vcsm_handle;
+-         iocache.s[2].cmd = 3; // clean+invalidate
+--        iocache.s[2].addr = p->arm + base;
+-+        iocache.s[2].addr = (int)p->arm + base;
+-         iocache.s[2].size  = sz;
+- #endif
+-         vcsm_clean_invalid( &iocache );
+-@@ -930,11 +1008,40 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- }
+- #endif
+- 
+-+#ifdef RPI_DEBLOCK_VPU
+-+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+-+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+-+{
+-+  // Flush image, 4 lines above to bottom of ctb stripe
+-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
+-+  // TODO flush buffer of beta/tc setup when it becomes cached
+-+  // Call VPU
+-+  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
+-+  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
+-+                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
+-+                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
+-+}
+-+
+-+static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
+-+{
+-+   int y2;
+-+   for(y2=y;y2<y+ctb_size;y2+=16) {
+-+      rpi_deblock(s,y2,16);
+-+   }
+-+}
+-+#endif
+-+
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
+-         deblocking_filter_CTB(s, x, y);
+-+#ifdef RPI_DEBLOCK_VPU
+-+    if (s->enable_rpi_deblock && x_end)
+-+    {
+-+      rpi_deblock(s, y, ctb_size);
+-+    }
+-+#endif
+-     if (s->ps.sps->sao_enabled) {
+-         int y_end = y >= s->ps.sps->height - ctb_size;
+-         if (y && x)
+-@@ -965,6 +1072,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+-+        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 4f13622..b3f155f 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,7 +3,13 @@ unsigned char rpi_hevc_transform [] = {
+- 106,
+- 0,
+- 144,
+--35,
+-+38,
+-+1,
+-+37,
+-+106,
+-+0,
+-+144,
+-+57,
+- 1,
+- 169,
+- 3,
+-@@ -627,4 +633,1798 @@ unsigned char rpi_hevc_transform [] = {
+- 30,
+- 90,
+- 0,
+-+169,
+-+3,
+-+73,
+-+64,
+-+52,
+-+64,
+-+45,
+-+64,
+-+2,
+-+64,
+-+10,
+-+64,
+-+64,
+-+198,
+-+1,
+-+7,
+-+8,
+-+232,
+-+63,
+-+0,
+-+0,
+-+0,
+-+6,
+-+232,
+-+253,
+-+255,
+-+255,
+-+255,
+-+0,
+-+246,
+-+0,
+-+0,
+-+0,
+-+4,
+-+215,
+-+64,
+-+3,
+-+96,
+-+2,
+-+248,
+-+0,
+-+35,
+-+0,
+-+0,
+-+64,
+-+56,
+-+0,
+-+0,
+-+4,
+-+248,
+-+0,
+-+36,
+-+0,
+-+0,
+-+64,
+-+56,
+-+8,
+-+0,
+-+0,
+-+240,
+-+64,
+-+0,
+-+132,
+-+3,
+-+128,
+-+240,
+-+0,
+-+0,
+-+132,
+-+3,
+-+128,
+-+144,
+-+137,
+-+0,
+-+131,
+-+98,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+129,
+-+0,
+-+131,
+-+102,
+-+0,
+-+158,
+-+67,
+-+0,
+-+2,
+-+248,
+-+0,
+-+35,
+-+0,
+-+0,
+-+64,
+-+56,
+-+0,
+-+0,
+-+4,
+-+248,
+-+0,
+-+36,
+-+0,
+-+0,
+-+64,
+-+56,
+-+8,
+-+0,
+-+0,
+-+240,
+-+64,
+-+0,
+-+132,
+-+3,
+-+128,
+-+240,
+-+0,
+-+0,
+-+132,
+-+3,
+-+128,
+-+144,
+-+108,
+-+0,
+-+131,
+-+98,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+100,
+-+0,
+-+131,
+-+102,
+-+0,
+-+248,
+-+64,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+248,
+-+0,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+144,
+-+161,
+-+0,
+-+188,
+-+64,
+-+67,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+150,
+-+0,
+-+195,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+12,
+-+128,
+-+7,
+-+192,
+-+130,
+-+248,
+-+0,
+-+0,
+-+112,
+-+192,
+-+224,
+-+16,
+-+195,
+-+31,
+-+132,
+-+248,
+-+1,
+-+0,
+-+112,
+-+0,
+-+224,
+-+16,
+-+203,
+-+31,
+-+3,
+-+99,
+-+131,
+-+71,
+-+68,
+-+232,
+-+32,
+-+0,
+-+0,
+-+0,
+-+0,
+-+99,
+-+2,
+-+99,
+-+23,
+-+102,
+-+7,
+-+106,
+-+127,
+-+156,
+-+182,
+-+255,
+-+0,
+-+248,
+-+64,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+248,
+-+0,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+144,
+-+112,
+-+0,
+-+188,
+-+64,
+-+67,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+101,
+-+0,
+-+195,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+12,
+-+128,
+-+7,
+-+192,
+-+130,
+-+248,
+-+0,
+-+0,
+-+112,
+-+192,
+-+224,
+-+16,
+-+195,
+-+31,
+-+132,
+-+248,
+-+1,
+-+0,
+-+112,
+-+0,
+-+224,
+-+16,
+-+203,
+-+31,
+-+25,
+-+102,
+-+9,
+-+106,
+-+2,
+-+30,
+-+41,
+-+3,
+-+26,
+-+87,
+-+162,
+-+64,
+-+64,
+-+198,
+-+1,
+-+23,
+-+127,
+-+158,
+-+103,
+-+255,
+-+239,
+-+3,
+-+0,
+-+254,
+-+0,
+-+143,
+-+92,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+143,
+-+93,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+143,
+-+94,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+143,
+-+95,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+142,
+-+208,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+142,
+-+209,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+142,
+-+210,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+0,
+-+142,
+-+211,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+128,
+-+144,
+-+107,
+-+0,
+-+8,
+-+255,
+-+99,
+-+23,
+-+0,
+-+212,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+23,
+-+0,
+-+228,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+227,
+-+23,
+-+0,
+-+244,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+35,
+-+52,
+-+0,
+-+180,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+99,
+-+52,
+-+0,
+-+164,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+52,
+-+0,
+-+148,
+-+192,
+-+51,
+-+0,
+-+0,
+-+111,
+-+3,
+-+239,
+-+3,
+-+0,
+-+254,
+-+0,
+-+143,
+-+12,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+143,
+-+13,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+143,
+-+14,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+143,
+-+15,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+142,
+-+16,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+142,
+-+17,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+142,
+-+18,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+0,
+-+142,
+-+19,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+128,
+-+144,
+-+33,
+-+0,
+-+8,
+-+255,
+-+99,
+-+3,
+-+0,
+-+212,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+3,
+-+0,
+-+228,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+227,
+-+3,
+-+0,
+-+244,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+35,
+-+4,
+-+0,
+-+180,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+99,
+-+4,
+-+0,
+-+164,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+4,
+-+0,
+-+148,
+-+192,
+-+51,
+-+0,
+-+0,
+-+111,
+-+3,
+-+32,
+-+246,
+-+192,
+-+11,
+-+1,
+-+16,
+-+32,
+-+246,
+-+2,
+-+137,
+-+47,
+-+240,
+-+40,
+-+246,
+-+2,
+-+140,
+-+47,
+-+240,
+-+128,
+-+245,
+-+99,
+-+140,
+-+5,
+-+4,
+-+0,
+-+247,
+-+99,
+-+140,
+-+1,
+-+20,
+-+88,
+-+246,
+-+99,
+-+140,
+-+1,
+-+20,
+-+0,
+-+247,
+-+35,
+-+136,
+-+62,
+-+226,
+-+32,
+-+247,
+-+35,
+-+136,
+-+32,
+-+210,
+-+0,
+-+247,
+-+34,
+-+136,
+-+63,
+-+2,
+-+208,
+-+246,
+-+34,
+-+136,
+-+0,
+-+4,
+-+0,
+-+247,
+-+99,
+-+136,
+-+58,
+-+162,
+-+32,
+-+247,
+-+99,
+-+136,
+-+33,
+-+146,
+-+0,
+-+247,
+-+98,
+-+136,
+-+59,
+-+18,
+-+208,
+-+246,
+-+98,
+-+136,
+-+0,
+-+20,
+-+0,
+-+247,
+-+162,
+-+136,
+-+33,
+-+2,
+-+88,
+-+246,
+-+98,
+-+137,
+-+2,
+-+68,
+-+88,
+-+246,
+-+162,
+-+137,
+-+3,
+-+68,
+-+208,
+-+254,
+-+227,
+-+136,
+-+60,
+-+242,
+-+192,
+-+243,
+-+188,
+-+11,
+-+208,
+-+254,
+-+227,
+-+136,
+-+56,
+-+178,
+-+192,
+-+243,
+-+188,
+-+10,
+-+32,
+-+255,
+-+226,
+-+136,
+-+38,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+208,
+-+254,
+-+227,
+-+136,
+-+59,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+32,
+-+255,
+-+226,
+-+136,
+-+49,
+-+58,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+226,
+-+136,
+-+34,
+-+34,
+-+192,
+-+243,
+-+60,
+-+128,
+-+32,
+-+255,
+-+226,
+-+136,
+-+37,
+-+58,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+192,
+-+136,
+-+1,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+0,
+-+255,
+-+194,
+-+8,
+-+0,
+-+52,
+-+195,
+-+243,
+-+0,
+-+128,
+-+0,
+-+255,
+-+202,
+-+40,
+-+0,
+-+52,
+-+195,
+-+243,
+-+0,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+35,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+192,
+-+136,
+-+1,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+0,
+-+255,
+-+226,
+-+140,
+-+34,
+-+34,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+255,
+-+227,
+-+140,
+-+36,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+0,
+-+254,
+-+192,
+-+136,
+-+0,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+16,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+16,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+32,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+32,
+-+254,
+-+226,
+-+136,
+-+35,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+11,
+-+96,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+115,
+-+5,
+-+106,
+-+0,
+-+144,
+-+173,
+-+1,
+-+27,
+-+96,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+147,
+-+5,
+-+106,
+-+0,
+-+144,
+-+227,
+-+0,
+-+64,
+-+246,
+-+163,
+-+140,
+-+1,
+-+4,
+-+0,
+-+246,
+-+192,
+-+175,
+-+63,
+-+2,
+-+0,
+-+246,
+-+192,
+-+174,
+-+59,
+-+2,
+-+0,
+-+246,
+-+128,
+-+175,
+-+62,
+-+2,
+-+0,
+-+246,
+-+128,
+-+174,
+-+58,
+-+2,
+-+0,
+-+246,
+-+64,
+-+175,
+-+61,
+-+2,
+-+0,
+-+246,
+-+64,
+-+174,
+-+57,
+-+2,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+228,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+191,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+235,
+-+143,
+-+52,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+2,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+191,
+-+226,
+-+192,
+-+243,
+-+188,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+2,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+190,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+171,
+-+143,
+-+52,
+-+226,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+180,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+191,
+-+226,
+-+192,
+-+243,
+-+188,
+-+10,
+-+128,
+-+253,
+-+43,
+-+240,
+-+3,
+-+212,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+35,
+-+141,
+-+1,
+-+196,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+189,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+107,
+-+143,
+-+52,
+-+210,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+148,
+-+192,
+-+243,
+-+128,
+-+11,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+228,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+187,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+235,
+-+142,
+-+52,
+-+178,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+2,
+-+148,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+187,
+-+162,
+-+192,
+-+243,
+-+188,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+2,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+186,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+171,
+-+142,
+-+52,
+-+162,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+244,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+187,
+-+162,
+-+192,
+-+243,
+-+188,
+-+10,
+-+128,
+-+253,
+-+43,
+-+240,
+-+3,
+-+148,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+35,
+-+141,
+-+1,
+-+132,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+185,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+107,
+-+142,
+-+52,
+-+146,
+-+192,
+-+243,
+-+60,
+-+128,
+-+64,
+-+255,
+-+98,
+-+141,
+-+0,
+-+52,
+-+192,
+-+243,
+-+0,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+53,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+147,
+-+5,
+-+106,
+-+0,
+-+144,
+-+177,
+-+0,
+-+88,
+-+246,
+-+163,
+-+140,
+-+1,
+-+4,
+-+128,
+-+245,
+-+99,
+-+141,
+-+10,
+-+4,
+-+88,
+-+246,
+-+162,
+-+138,
+-+1,
+-+68,
+-+0,
+-+247,
+-+162,
+-+138,
+-+36,
+-+162,
+-+88,
+-+254,
+-+162,
+-+138,
+-+3,
+-+164,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+226,
+-+137,
+-+32,
+-+2,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+247,
+-+226,
+-+137,
+-+42,
+-+114,
+-+0,
+-+255,
+-+34,
+-+138,
+-+33,
+-+18,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+247,
+-+34,
+-+138,
+-+42,
+-+130,
+-+16,
+-+246,
+-+98,
+-+138,
+-+40,
+-+114,
+-+16,
+-+246,
+-+98,
+-+138,
+-+41,
+-+146,
+-+32,
+-+246,
+-+98,
+-+138,
+-+41,
+-+146,
+-+32,
+-+246,
+-+226,
+-+137,
+-+41,
+-+146,
+-+40,
+-+246,
+-+34,
+-+138,
+-+41,
+-+146,
+-+32,
+-+247,
+-+163,
+-+141,
+-+63,
+-+178,
+-+32,
+-+247,
+-+227,
+-+141,
+-+62,
+-+162,
+-+0,
+-+254,
+-+0,
+-+240,
+-+8,
+-+4,
+-+0,
+-+240,
+-+128,
+-+11,
+-+128,
+-+253,
+-+35,
+-+240,
+-+9,
+-+100,
+-+192,
+-+243,
+-+128,
+-+10,
+-+128,
+-+253,
+-+163,
+-+141,
+-+128,
+-+115,
+-+192,
+-+243,
+-+152,
+-+10,
+-+88,
+-+246,
+-+163,
+-+141,
+-+4,
+-+100,
+-+208,
+-+246,
+-+35,
+-+139,
+-+0,
+-+100,
+-+32,
+-+255,
+-+34,
+-+139,
+-+53,
+-+202,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+139,
+-+0,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+240,
+-+246,
+-+163,
+-+141,
+-+48,
+-+98,
+-+0,
+-+247,
+-+99,
+-+139,
+-+63,
+-+210,
+-+0,
+-+247,
+-+98,
+-+139,
+-+1,
+-+212,
+-+88,
+-+254,
+-+98,
+-+139,
+-+1,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+32,
+-+255,
+-+99,
+-+139,
+-+62,
+-+98,
+-+192,
+-+243,
+-+188,
+-+10,
+-+88,
+-+246,
+-+98,
+-+139,
+-+1,
+-+212,
+-+240,
+-+246,
+-+98,
+-+139,
+-+50,
+-+210,
+-+0,
+-+247,
+-+163,
+-+128,
+-+59,
+-+146,
+-+0,
+-+247,
+-+160,
+-+128,
+-+1,
+-+36,
+-+88,
+-+254,
+-+160,
+-+128,
+-+1,
+-+36,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+247,
+-+163,
+-+128,
+-+58,
+-+98,
+-+64,
+-+255,
+-+35,
+-+240,
+-+0,
+-+100,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+255,
+-+163,
+-+128,
+-+0,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+160,
+-+128,
+-+1,
+-+36,
+-+240,
+-+246,
+-+160,
+-+128,
+-+50,
+-+34,
+-+8,
+-+255,
+-+227,
+-+143,
+-+54,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+40,
+-+255,
+-+227,
+-+142,
+-+54,
+-+178,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+39,
+-+10,
+-+0,
+-+240,
+-+60,
+-+128,
+-+8,
+-+255,
+-+163,
+-+143,
+-+45,
+-+226,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+44,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+40,
+-+10,
+-+0,
+-+240,
+-+60,
+-+128,
+-+8,
+-+255,
+-+163,
+-+142,
+-+2,
+-+162,
+-+192,
+-+243,
+-+60,
+-+128,
+-+90,
+-+0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index fd159bc..b055208 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -83,6 +83,8 @@
+- hevc_trans_16x16:
+-   cmp r5,1
+-   beq memclear16
+-+  cmp r5,2
+-+  beq hevc_deblock_16x16
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -282,3 +284,427 @@ loop:
+-   cmp r1,0
+-   bgt loop
+-   b lr
+-+
+-+
+-+################################################################################
+-+# HEVC VPU Deblock
+-+#
+-+# Vertical edges before horizontal
+-+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+-+#
+-+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+-+# The VPU code works in units of 16x16 blocks.
+-+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+-+# One final horizontal filter is required at the end.
+-+# PCM is not allowed in this code.
+-+#
+-+#
+-+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+-+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
+-+
+-+.set P0,63
+-+.set P1,62
+-+.set P2,61
+-+.set P3,60
+-+.set Q0,59
+-+.set Q1,58
+-+.set Q2,57
+-+.set Q3,56
+-+
+-+.set dp,32
+-+.set dq,33
+-+.set d,34
+-+.set decision,35
+-+.set beta,36
+-+.set beta2,37
+-+.set beta3,38
+-+.set ptest,39
+-+.set qtest,40
+-+.set pqtest,41
+-+.set thresh,42
+-+.set deltatest, 44
+-+.set deltap1, 45
+-+.set tc25, 46
+-+.set setup,47
+-+.set tc,48
+-+.set tc25,49
+-+.set tc2, 50
+-+.set do_filter, 51
+-+.set delta, 52
+-+.set tc10, 53
+-+.set delta0, 54
+-+.set delta1, 55
+-+.set zeros, 0
+-+.set setup_input, 1
+-+.set deltaq1, 2
+-+
+-+
+-+
+-+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+-+# Row has num16 16x16 blocks across
+-+# Beta goes from 0 to 64
+-+# tc goes from 0 to 24
+-+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+-+#   has 8 bytes per edge
+-+#   has 16 bytes per direction
+-+#   has 32 bytes per 16x16 block
+-+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
+-+hevc_deblock_16x16:
+-+  push r6-r15, lr
+-+  mov r9,r4
+-+  mov r4,r3
+-+  mov r13,r2
+-+  mov r2,r0
+-+  mov r10,r0
+-+  subscale4 r0,r1
+-+  mov r8,63
+-+  mov r6,-3
+-+  vmov H(zeros,0),0
+-+# r7 is number of blocks still to load
+-+# r0 is location of current block - 4 * stride
+-+# r1 is stride
+-+# r2 is location of current block
+-+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+-+# r4 is setup
+-+# r5 is for temporary calculations
+-+# r8 holds 63
+-+# r6 holds -3
+-+# r9 holds the number of 16 high rows to process
+-+# r10 holds the original img base
+-+# r11 returns 0 if no filtering was done on the edge
+-+# r12 saves a copy of this
+-+# r13 is copy of width
+-+
+-+process_row:
+-+  # First iteration does not do horizontal filtering on previous
+-+  mov r7, r13
+-+  mov r3,0
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+-+  vstb H(zeros,0),(r4)
+-+  bl vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+-+  bl vert_filter
+-+  sub r3,8
+-+  b start_deblock_loop
+-+deblock_loop:
+-+  # Middle iterations do vertical on current block and horizontal on preceding
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)
+-+  vstb H(zeros,0),(r4)
+-+  bl vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl vert_filter
+-+  sub r3,8
+-+  vldb H(setup_input,0), -16(r4)
+-+  vstb H(zeros,0),-16(r4)
+-+  bl horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl horz_filter
+-+  sub r3,8*64
+-+  addcmpbeq r12,0,0,skip_save_top
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+skip_save_top:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+start_deblock_loop:
+-+  # move onto next 16x16 (could do this with circular buffer support instead)
+-+  add r3,16
+-+  and r3,r8
+-+  add r4,32
+-+  # Perform loop counter operations (may work with an addcmpbgt as well?)
+-+  add r0,16
+-+  add r2,16
+-+  sub r7,1
+-+  cmp r7,0 # Are there still more blocks to load
+-+  bgt deblock_loop
+-+
+-+  # Final iteration needs to just do horizontal filtering
+-+  vldb H(setup_input,0), -16(r4)
+-+  vstb H(zeros,0),-16(r4)
+-+  bl horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl horz_filter
+-+  sub r3,64*8
+-+  addcmpbeq r12,0,0,skip_save_top2
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+skip_save_top2:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+
+-+# Now look to see if we should do another row
+-+  sub r9,1
+-+  cmp r9,0
+-+  bgt start_again
+-+  pop r6-r15, pc
+-+start_again:
+-+  # Need to sort out r0,r2 to point to next row down
+-+  addscale16 r10,r1
+-+  mov r2,r10
+-+  subscale4 r0,r2,r1
+-+  b process_row
+-+
+-+
+-+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+-+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+-+
+-+vert_filter:
+-+  push lr
+-+
+-+  vmov HX(P3,0), V(16,12)+r3
+-+  vmov HX(P2,0), V(16,13)+r3
+-+  vmov HX(P1,0), V(16,14)+r3
+-+  vmov HX(P0,0), V(16,15)+r3
+-+  vmov HX(Q0,0), V(16,16)+r3
+-+  vmov HX(Q1,0), V(16,17)+r3
+-+  vmov HX(Q2,0), V(16,18)+r3
+-+  vmov HX(Q3,0), V(16,19)+r3
+-+
+-+  bl do_luma_filter
+-+
+-+  vadds V(16,13)+r3, HX(P2,0), 0
+-+  vadds V(16,14)+r3, HX(P1,0), 0
+-+  vadds V(16,15)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds V(16,16)+r3, HX(Q0,0), 0
+-+  vadds V(16,17)+r3, HX(Q1,0), 0
+-+  vadds V(16,18)+r3, HX(Q2,0), 0
+-+
+-+  pop pc
+-+
+-+# Filter edge at H(16,0)+r3
+-+horz_filter:
+-+  push lr
+-+
+-+  vmov HX(P3,0), H(12,0)+r3
+-+  vmov HX(P2,0), H(13,0)+r3
+-+  vmov HX(P1,0), H(14,0)+r3
+-+  vmov HX(P0,0), H(15,0)+r3
+-+  vmov HX(Q0,0), H(16,0)+r3
+-+  vmov HX(Q1,0), H(17,0)+r3
+-+  vmov HX(Q2,0), H(18,0)+r3
+-+  vmov HX(Q3,0), H(19,0)+r3
+-+
+-+  bl do_luma_filter
+-+
+-+  vadds H(13,0)+r3, HX(P2,0), 0
+-+  vadds H(14,0)+r3, HX(P1,0), 0
+-+  vadds H(15,0)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds H(16,0)+r3, HX(Q0,0), 0
+-+  vadds H(17,0)+r3, HX(Q1,0), 0
+-+  vadds H(18,0)+r3, HX(Q2,0), 0
+-+
+-+  pop pc
+-+
+-+# r4 points to array of beta/tc for each 4 length edge
+-+do_luma_filter:
+-+  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+-+  valtl HX(beta,0),H(setup,0),H(setup,0)
+-+  valtu HX(tc,0),H(setup,0),H(setup,0)
+-+  vmul HX(tc25,0), HX(tc,0), 5
+-+  vadd HX(tc25,0),HX(tc25,0), 1
+-+  vasr HX(tc25,0), HX(tc25,0), 1
+-+
+-+  # Compute decision
+-+  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+-+  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+-+  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+-+  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+-+
+-+  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+-+  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+-+  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+-+  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+-+
+-+  vadd HX(d,0), HX(dp,0), HX(dq,0)
+-+  vasr HX(beta2,0),HX(beta,0),2
+-+  vasr HX(beta3,0),HX(beta,0),3
+-+
+-+  # Compute flags that are negative if all conditions pass
+-+  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+-+  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+-+  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+-+
+-+  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+-+  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+-+  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+-+  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+-+  vmov HX(decision,0), 1 IFNN
+-+  vadd H(decision,0),H(decision,3),0 IFN
+-+  vadd H(decision,16),H(decision,19),0 IFN
+-+  vmov -,HX(decision,0) SETF   # N marks strong filter
+-+  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
+-+
+-+  vadd HX(do_filter,0), HX(d,3), HX(d,0)
+-+  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+-+  vmov HX(decision,0),0 IFNN # Z marks no filter
+-+
+-+  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
+-+  # First extract out even terms
+-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
+-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
+-+  # Now expand back
+-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+-+
+-+  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+-+
+-+  # Do a quick check to see if there is anything to do
+-+  mov r11, 0 # Signal no filtering
+-+  vmov -,1 IFNZ SUMS r5
+-+  cmp r5,0
+-+  beq filtering_done
+-+  mov r11, 1 # Signal some filtering
+-+  # And whether there is any strong filtering
+-+  vmov -,1 IFN SUMS r5
+-+  cmp r5,0
+-+  beq normal_filtering
+-+
+-+  ##############################################################################
+-+  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+-+  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tx2 is tc/2, while here it is tc*2
+-+
+-+  # Take a copy of the original pixels for use in decision calculation
+-+  vmov HX(P0,32),HX(P0,0)
+-+  vmov HX(Q0,32),HX(Q0,0)
+-+  vmov HX(P1,32),HX(P1,0)
+-+  vmov HX(Q1,32),HX(Q1,0)
+-+  vmov HX(P2,32),HX(P2,0)
+-+  vmov HX(Q2,32),HX(Q2,0)
+-+
+-+  vadd -,HX(P2,32),4 CLRA SACC
+-+  vshl -,HX(P1,32),1 SACC
+-+  vshl -,HX(P0,32),1 SACC
+-+  vshl -,HX(Q0,32),1 SACC
+-+  vshl HX(delta,0),HX(Q1,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(P0,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(P2,32),2 CLRA SACC
+-+  vadd -,HX(P1,32),HX(P0,32) SACC
+-+  vshl HX(delta,0),HX(Q0,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 2
+-+  vsub HX(delta,0),HX(delta,0),HX(P1,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(Q0,32),4 CLRA SACC
+-+  vadd -,HX(P1,32),HX(P0,32) SACC
+-+  vmul -,HX(P2,32),3 SACC
+-+  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(P2,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+-+  #vmov HX(P2,0),3 IFN
+-+
+-+  # Now reverse all P/Qs
+-+
+-+  vadd -,HX(Q2,32),4 CLRA SACC
+-+  vshl -,HX(Q1,32),1 SACC
+-+  vshl -,HX(Q0,32),1 SACC
+-+  vshl -,HX(P0,32),1 SACC
+-+  vshl HX(delta,0),HX(P1,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(Q2,32),2 CLRA SACC
+-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
+-+  vshl HX(delta,0),HX(P0,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 2
+-+  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(P0,32),4 CLRA SACC
+-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
+-+  vmul -,HX(Q2,32),3 SACC
+-+  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+-+
+-+  ##############################################################################
+-+  # Normal filtering
+-+normal_filtering:
+-+  # Invert the decision flags
+-+  # make instruction more complicated as assembler has error and loses SETF
+-+  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+-+  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
+-+
+-+  vmov -,1 IFN SUMS r5
+-+  cmp r5,0
+-+  beq filtering_done
+-+
+-+  vasr HX(tc2,0), HX(tc,0), 1
+-+  vmul HX(tc10,0), HX(tc,0), 10
+-+
+-+  vasr HX(thresh,0), HX(beta,0), 1
+-+  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+-+  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+-+
+-+  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+-+  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+-+  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+-+  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+-+  # Expand ptest and qtest together
+-+  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
+-+  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+-+  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+-+  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+-+  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+-+
+-+  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+-+  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+-+  vmov -,8 CLRA SACC
+-+  vmul -,HX(delta0,0), 9 SACC
+-+  vmul HX(delta0,0),HX(delta1,0), r6 SACC
+-+  vasr HX(delta0,0), HX(delta0,0), 4
+-+  vdist HX(deltatest,0), HX(delta0,0), 0
+-+  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+-+  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+-+
+-+  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+-+
+-+  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+-+  vadd HX(deltap1,0), HX(deltap1,0), 1
+-+  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+-+  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+-+  vasr HX(deltap1,0), HX(deltap1,0), 1
+-+  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+-+
+-+  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+-+  vadd HX(deltaq1,0), HX(deltaq1,0), 1
+-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+-+  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+-+  vrsub -, HX(delta0,0), 0 SACC
+-+  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1
+-+  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+-+
+-+  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+-+  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+-+
+-+  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+-+  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+-+
+-+  vmov -,HX(deltatest,0) SETF
+-+  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+-+  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+-+
+-+  #vmov HX(P2,0),1 IFN
+-+
+-+filtering_done:
+-+  b lr
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 0121fca..05b2169 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -147,7 +147,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   vcsm_init();
+-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+--  memset(ptr, 0, sizeof *ptr);
+-+  memset((void*)ptr, 0, sizeof *ptr);
+-   vc = gpu_mem_ptr.vc;
+- 
+-   ptr->mb = mb;
+-@@ -254,7 +254,7 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].addr = (int) p->arm;
+-     iocache.s[0].size  = p->numbytes;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -390,6 +390,7 @@ static void *vpu_start(void *arg) {
+- #ifdef RPI_TIME_TOTAL_POSTED
+-   int last_time=0;
+-   long long on_time=0;
+-+  long long on_time_deblock=0;
+-   long long off_time=0;
+-   int start_time;
+-   int end_time;
+-@@ -451,10 +452,13 @@ static void *vpu_start(void *arg) {
+- #ifdef RPI_TIME_TOTAL_POSTED
+-     end_time = Microseconds();
+-     last_time = end_time;
+--    on_time += end_time - start_time;
+-+    if (p[6]==2)
+-+      on_time_deblock += end_time - start_time;
+-+    else
+-+      on_time += end_time - start_time;
+-     count++;
+-     if ((count&0x7f)==0)
+--      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+- #endif
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index e86eb30..c5d8b29 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -61,7 +61,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+- /* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+- /* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
+--/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
+-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif,r2
+- /* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
+- /* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
+- /* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--- 
+-2.7.4
+-
+-
+-From e9c59f0d7b42dfb10d85ab2477f95b44484a8d70 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 1 Jul 2015 09:21:17 +0100
+-Subject: [PATCH 65/68] Added ability to combine jobs
+-
+----
+- libavcodec/rpi_qpu.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
+- 1 file changed, 80 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 05b2169..91777be 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -8,6 +8,8 @@
+- #define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+-+// Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
+-+#define RPI_COMBINE_JOBS
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -398,9 +400,15 @@ static void *vpu_start(void *arg) {
+- #endif
+-   while(1) {
+-     int i;
+--    int *p;
+-+    int *p; // Pointer for a QPU/VPU job
+-+#ifdef RPI_COMBINE_JOBS
+-+    int *q = NULL; // Pointer for a VPU only job
+-+    int have_qpu = 0;
+-+    int have_vpu = 0;
+-+#endif
+-     int qpu_code;
+-     int qpu_codeb;
+-+    int num_jobs; // Number of jobs available
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -408,13 +416,38 @@ static void *vpu_start(void *arg) {
+-       pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+-     p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    num_jobs = vpu_async_tail - vpu_async_head;
+-     pthread_mutex_unlock(&post_mutex);
+- 
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+-+    if (p[7] == 0 && p[0] == 0 && p[16]==0)
+-+      goto job_done_early;
+-+
+-+#ifdef RPI_COMBINE_JOBS
+-+    // First scan for a qpu job
+-+    for (int x=0;x<num_jobs;x++) {
+-+      p = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
+-+      if (p[7]) {
+-+        have_qpu = 1;
+-+        break;
+-+      }
+-+    }
+-+    // Now scan for a non-qpu job
+-+    for (int x=0;x<num_jobs;x++) {
+-+      q = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
+-+      if (!q[7]) {
+-+        have_vpu = 1;
+-+        break;
+-+      }
+-+    }
+-+    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+-+#endif
+-     qpu_code = p[7];
+-     qpu_codeb = p[16];
+-+
+-+
+-     //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-@@ -427,6 +460,40 @@ static void *vpu_start(void *arg) {
+-     off_time += start_time-last_time;
+- #endif
+- 
+-+#ifdef RPI_COMBINE_JOBS
+-+    if (have_qpu) {
+-+      for(i=0;i<8;i++) {
+-+        gpu->mail[i*2] = p[8+i];
+-+        gpu->mail[i*2 + 1] = qpu_code;
+-+      }
+-+      for(i=0;i<12;i++) {
+-+        gpu->mail2[i*2] = p[17+i];
+-+        gpu->mail2[i*2 + 1] = qpu_codeb;
+-+      }
+-+      if (have_vpu) {
+-+        execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
+-+        q[0] = 0;
+-+      } else {
+-+        execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+      }
+-+      p[0] = 0;
+-+      p[7] = 0;
+-+      p[16] = 0;
+-+    } else {
+-+        av_assert0(have_vpu);
+-+        vpu_execute_code(q[0], q[1], q[2], q[3], q[4], q[5], q[6]);
+-+        q[0] = 0;
+-+    }
+-+#else
+-+
+-     if (!qpu_code) {
+-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-     } else {
+-@@ -449,17 +516,29 @@ static void *vpu_start(void *arg) {
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+- #endif
+-     }
+-+#endif
+-+
+- #ifdef RPI_TIME_TOTAL_POSTED
+-     end_time = Microseconds();
+-     last_time = end_time;
+-+#ifdef RPI_COMBINE_JOBS
+-+    // There are three cases we may wish to distinguish of VPU/QPU activity
+-+    on_time += end_time - start_time;
+-+#else
+-     if (p[6]==2)
+-       on_time_deblock += end_time - start_time;
+-     else
+-       on_time += end_time - start_time;
+-+#endif
+-     count++;
+-     if ((count&0x7f)==0)
+-+#ifdef RPI_COMBINE_JOBS
+-       printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+-+#else
+-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+- #endif
+-+job_done_early:
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+--- 
+-2.7.4
+-
+-
+-From 0d54661f303b2a8903e806648ed54a34dcf315dc Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 1 Jul 2015 12:53:10 +0100
+-Subject: [PATCH 66/68] Added chroma deblocking
+-
+----
+- libavcodec/hevc.c               |  20 ++
+- libavcodec/hevc.h               |  12 +-
+- libavcodec/hevc_filter.c        |  92 +++++-
+- libavcodec/rpi_hevc_transform.h | 644 +++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s | 207 +++++++++++++
+- libavcodec/rpi_qpu.c            |  27 +-
+- libavcodec/rpi_shader.qasm      |  11 +
+- 7 files changed, 988 insertions(+), 25 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4ce94a7..8437e10 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -251,6 +251,14 @@ static void pic_arrays_free(HEVCContext *s)
+-       gpu_free(&s->y_setup_ptr);
+-       s->y_setup_arm = 0;
+-     }
+-+    if (s->uv_setup_arm) {
+-+      gpu_free(&s->uv_setup_ptr);
+-+      s->uv_setup_arm = 0;
+-+    }
+-+    if (s->vpu_cmds_arm) {
+-+      gpu_free(&s->vpu_cmds_ptr);
+-+      s->vpu_cmds_arm = 0;
+-+    }
+- #endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+-@@ -324,6 +332,18 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     s->y_setup_vc = (void*)s->y_setup_ptr.vc;
+-     memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
+-     printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
+-+
+-+    s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+-+    s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+-+    gpu_malloc_uncached(sizeof(*s->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height, &s->uv_setup_ptr); // TODO make this cached
+-+    s->uv_setup_arm = (void*)s->uv_setup_ptr.arm;
+-+    s->uv_setup_vc = (void*)s->uv_setup_ptr.vc;
+-+    memset(s->uv_setup_arm, 0, s->uv_setup_ptr.numbytes);
+-+    printf("Setup uv %d by %d by %d\n",s->uv_setup_width,s->uv_setup_height,sizeof(*s->uv_setup_arm));
+-+
+-+    gpu_malloc_uncached(sizeof(*s->vpu_cmds_arm) * 3,&s->vpu_cmds_ptr);
+-+    s->vpu_cmds_arm = (void*) s->vpu_cmds_ptr.arm;
+-+    s->vpu_cmds_vc = s->vpu_cmds_ptr.vc;
+- #endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index cf08489..7eb37e6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -56,7 +56,7 @@
+-   #define RPI_MAX_JOBS 2
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+--
+-+  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+-   #define RPI_DEBLOCK_VPU
+- 
+- #endif
+-@@ -980,6 +980,16 @@ typedef struct HEVCContext {
+-     uint8_t (*y_setup_vc)[2][2][2][4];
+-     int setup_width; // Number of 16x16 blocks across the image
+-     int setup_height; // Number of 16x16 blocks down the image
+-+
+-+    GPU_MEM_PTR_T uv_setup_ptr;
+-+    uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+-+    uint8_t (*uv_setup_vc)[2][2][2][4];
+-+    int uv_setup_width;
+-+    int uv_setup_height;
+-+
+-+    GPU_MEM_PTR_T vpu_cmds_ptr;
+-+    int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+-+    int vpu_cmds_vc;
+- #endif
+- 
+- #endif
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 06371da..6367068 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -656,9 +656,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                                    s->frame->linesize[chroma],
+-                                                                    c_tc, no_p, no_q);
+-                         } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                        if (s->enable_rpi_deblock) {
+-+                            uint8_t (*setup)[2][2][4];
+-+                            int xc = x>>s->ps.sps->hshift[chroma];
+-+                            int yc = y>>s->ps.sps->vshift[chroma];
+-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+-+                            int a = ((yc>>3) & 1) << 1;
+-+                            int b = (xc>>3) & 1;
+-+                            setup = s->uv_setup_arm[num16];
+-+                            setup[0][b][0][a] = c_tc[0];
+-+                            setup[0][b][0][a + 1] = c_tc[1];
+-+                        } else
+-+#endif
+-                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
+-                                                                  s->frame->linesize[chroma],
+-                                                                  c_tc, no_p, no_q);
+-+
+-                     }
+-                 }
+- 
+-@@ -689,6 +703,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                                    s->frame->linesize[chroma],
+-                                                                    c_tc, no_p, no_q);
+-                         } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                        if (s->enable_rpi_deblock) {
+-+                            uint8_t (*setup)[2][2][4];
+-+                            int xc = x>>s->ps.sps->hshift[chroma];
+-+                            int yc = y>>s->ps.sps->vshift[chroma];
+-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+-+                            int a = ((xc>>3) & 1) << 1;
+-+                            int b = (yc>>3) & 1;
+-+                            setup = s->uv_setup_arm[num16];
+-+                            setup[1][b][0][a] = c_tc[0];
+-+                            setup[1][b][0][a + 1] = c_tc[1];
+-+                        } else
+-+#endif
+-                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
+-                                                                  s->frame->linesize[chroma],
+-                                                                  c_tc, no_p, no_q);
+-@@ -1013,33 +1040,56 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+- {
+-   // Flush image, 4 lines above to bottom of ctb stripe
+--  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
+-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+-   // TODO flush buffer of beta/tc setup when it becomes cached
+-+
+-+  // Prepare three commands at once to avoid calling overhead
+-+  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
+-+  s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+-+  s->vpu_cmds_arm[0][2] = s->setup_width;
+-+  s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
+-+  s->vpu_cmds_arm[0][4] = ctb_size>>4;
+-+  s->vpu_cmds_arm[0][5] = 2;
+-+
+-+  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-+  s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+-+  s->vpu_cmds_arm[1][2] = s->uv_setup_width;
+-+  s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-+  s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-+  s->vpu_cmds_arm[1][5] = 3;
+-+
+-+  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-+  s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+-+  s->vpu_cmds_arm[2][2] = s->uv_setup_width;
+-+  s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-+  s->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-+  s->vpu_cmds_arm[2][5] = 4;
+-+
+-   // Call VPU
+--  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
+--  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
+--                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
+--                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
+-+  vpu_wait(vpu_post_code( vpu_get_fn(), s->vpu_cmds_vc, 3, 0, 0, 0, 5, 0)); // 5 means to do all the commands
+- }
+- 
+--static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
+--{
+--   int y2;
+--   for(y2=y;y2<y+ctb_size;y2+=16) {
+--      rpi_deblock(s,y2,16);
+--   }
+--}
+- #endif
+- 
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-+#ifdef RPI_DEBLOCK_VPU
+-+    int done_deblock = 0;
+-+#endif
+-     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
+-         deblocking_filter_CTB(s, x, y);
+- #ifdef RPI_DEBLOCK_VPU
+-     if (s->enable_rpi_deblock && x_end)
+-     {
+--      rpi_deblock(s, y, ctb_size);
+-+      int y_at_end = y >= s->ps.sps->height - ctb_size;
+-+      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
+-+      int y_start = y&~63;
+-+      if (y_at_end) height = s->ps.sps->height - y_start;
+-+      if ((((y+ctb_size)&63)==0) || y_at_end) {
+-+        done_deblock = 1;
+-+        rpi_deblock(s, y_start, height);
+-+      }
+-     }
+- #endif
+-     if (s->ps.sps->sao_enabled) {
+-@@ -1070,11 +1120,25 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+-+#ifdef RPI_DEBLOCK_VPU
+-+        if (s->enable_rpi_deblock) {
+-+          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+-+          if (done_deblock) {
+-+            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+          }
+-+        } else {
+-+#ifdef RPI_INTER_QPU
+-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+-+#endif
+-+          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+        }
+-+#else
+- #ifdef RPI_INTER_QPU
+-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+--        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+-+        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+#endif
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index b3f155f..4309f1c 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,14 +3,32 @@ unsigned char rpi_hevc_transform [] = {
+- 106,
+- 0,
+- 144,
+--38,
+++192,
+++243,
+++211,
+++31,
+++128,
+++248,
+++0,
+++0,
+++112,
+++0,
+++192,
+++243,
+++211,
+++31,
+++128,
+++144,
+++112,
+++0,
+++188,
+++64,
+++67,
+++232,
+++0,
+++2,
+++0,
+++0,
+++0,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++101,
+++0,
+++195,
+++232,
+++0,
+++2,
+++0,
+++0,
+++12,
+++128,
+++7,
+++192,
+++130,
+++248,
+++0,
+++0,
+++112,
+++192,
+++224,
+++16,
+++195,
+++31,
+++132,
+++248,
+++1,
+++0,
+++112,
+++0,
+++224,
+++16,
+++203,
+++31,
+++25,
+++102,
+++9,
+++106,
+++2,
+++30,
+++41,
+++3,
+++26,
+++87,
+++162,
+++64,
+++64,
+++198,
+++1,
+++23,
+++127,
+++158,
+++103,
+++255,
+++239,
+++3,
+++0,
+++254,
+++0,
+++143,
+++92,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++143,
+++93,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++143,
+++94,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++143,
+++95,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++142,
+++208,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++142,
+++209,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++142,
+++210,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++0,
+++142,
+++211,
+++0,
+++0,
+++240,
+++12,
+++0,
+++128,
+++144,
+++107,
+++0,
+++8,
+++255,
+++99,
+++23,
+++0,
+++212,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++23,
+++0,
+++228,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++227,
+++23,
+++0,
+++244,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++35,
+++52,
+++0,
+++180,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++99,
+++52,
+++0,
+++164,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++52,
+++0,
+++148,
+++192,
+++51,
+++0,
+++0,
+++111,
+++3,
+++239,
+++3,
+++0,
+++254,
+++0,
+++143,
+++12,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++143,
+++13,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++143,
+++14,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++143,
+++15,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++142,
+++16,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++142,
+++17,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++142,
+++18,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++0,
+++142,
+++19,
+++0,
+++0,
+++240,
+++12,
+++0,
+++128,
+++144,
+++33,
+++0,
+++8,
+++255,
+++99,
+++3,
+++0,
+++212,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++3,
+++0,
+++228,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++227,
+++3,
+++0,
+++244,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++35,
+++4,
+++0,
+++180,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++99,
+++4,
+++0,
+++164,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++4,
+++0,
+++148,
+++192,
+++51,
+++0,
+++0,
+++111,
+++3,
+++32,
+++246,
+++192,
+++11,
+++1,
+++16,
+++32,
+++246,
+++2,
+++137,
+++47,
+++240,
+++40,
+++246,
+++2,
+++140,
+ +47,
+- 1,
+- 37,
+- 106,
+- 0,
+- 144,
+--57,
+++240,
+++128,
+++245,
+++99,
+++140,
+++5,
+++4,
+++0,
+++247,
+++99,
+++140,
+++1,
+++20,
+++88,
+++246,
+++99,
+++140,
+++1,
+++20,
+++0,
+++247,
+++35,
+++136,
+++62,
+++226,
+++32,
+++247,
+++35,
+++136,
+++32,
+++210,
+++0,
+++247,
+++34,
+++136,
+++63,
+++2,
+++208,
+++246,
+++34,
+++136,
+++0,
+++4,
+++0,
+++247,
+++99,
+++136,
+++58,
+++162,
+++32,
+++247,
+++99,
+++136,
+++33,
+++146,
+++0,
+++247,
+++98,
+++136,
+++59,
+++18,
+++208,
+++246,
+++98,
+++136,
+++0,
+++20,
+++0,
+++247,
+++162,
+++136,
+++33,
+++2,
+++88,
+++246,
+++98,
+++137,
+++2,
+++68,
+++88,
+++246,
+++162,
+++137,
+++3,
+++68,
+++208,
+++254,
+++227,
+++136,
+++60,
+++242,
+++192,
+++243,
+++188,
+++11,
+++208,
+++254,
+++227,
+++136,
+++56,
+++178,
+++192,
+++243,
+++188,
+++10,
+++32,
+++255,
+++226,
+++136,
+++38,
+++58,
+++192,
+++243,
+++60,
+++0,
+++208,
+++254,
+++227,
+++136,
+++59,
+++242,
+++192,
+++243,
+++60,
+++128,
+++32,
+++255,
+++226,
+++136,
+++49,
+++58,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++226,
+++136,
+++34,
+++34,
+++192,
+++243,
+++60,
+++128,
+++32,
+++255,
+++226,
+++136,
+++37,
+++58,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++192,
+++136,
+++1,
+++4,
+++0,
+++240,
+++0,
+++160,
+++0,
+++255,
+++194,
+++8,
+++0,
+++52,
+++195,
+++243,
+++0,
+++128,
+++0,
+++255,
+++202,
+++40,
+++0,
+++52,
+++195,
+++243,
+++0,
+++128,
+++0,
+++254,
+++0,
+++240,
+++35,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++192,
+++136,
+++1,
+++4,
+++0,
+++240,
+++0,
+++160,
+++0,
+++255,
+++226,
+++140,
+++34,
+++34,
+++195,
+++243,
+++60,
+++0,
+++32,
+++255,
+++227,
+++140,
+++36,
+++58,
+++192,
+++243,
+++60,
+++0,
+++0,
+++254,
+++192,
+++136,
+++0,
+++4,
+++0,
+++240,
+++0,
+++160,
+++16,
+++246,
+++226,
+++136,
+++35,
+++50,
+++16,
+++246,
+++226,
+++136,
+++35,
+++50,
+++32,
+++246,
+++226,
+++136,
+++35,
+++50,
+++32,
+++254,
+++226,
+++136,
+++35,
+++58,
+++192,
+++243,
+++60,
+++0,
+++11,
+++96,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++115,
+++5,
+++106,
+++0,
+++144,
+++173,
+++1,
+++27,
+++96,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++147,
+++5,
+++106,
+++0,
+++144,
+++227,
+++0,
+++64,
+++246,
+++163,
+++140,
+++1,
+++4,
+++0,
+++246,
+++192,
+++175,
+++63,
+++2,
+++0,
+++246,
+++192,
+++174,
+++59,
+++2,
+++0,
+++246,
+++128,
+++175,
+++62,
+++2,
+++0,
+++246,
+++128,
+++174,
+++58,
+++2,
+++0,
+++246,
+++64,
+++175,
+++61,
+++2,
+++0,
+++246,
+++64,
+++174,
+++57,
+++2,
+++0,
+++255,
+++43,
+++240,
+++4,
+++212,
+++192,
+++243,
+++128,
+++11,
+++64,
+++254,
+++43,
+++240,
+++1,
+++228,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++244,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++180,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++164,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++191,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++235,
+++143,
+++52,
+++242,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++2,
+++212,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++191,
+++226,
+++192,
+++243,
+++188,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++180,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++2,
+++68,
+++32,
+++247,
+++35,
+++141,
+++190,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++171,
+++143,
+++52,
+++226,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++180,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++191,
+++226,
+++192,
+++243,
+++188,
+++10,
+++128,
+++253,
+++43,
+++240,
+++3,
+++212,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++35,
+++141,
+++1,
+++196,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++189,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++107,
+++143,
+++52,
+++210,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++148,
+++192,
+++243,
+++128,
+++11,
+++64,
+++254,
+++43,
+++240,
+++1,
+++164,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++180,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++244,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++228,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++187,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++235,
+++142,
+++52,
+++178,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++2,
+++148,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++187,
+++162,
+++192,
+++243,
+++188,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++244,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++2,
+++68,
+++32,
+++247,
+++35,
+++141,
+++186,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++171,
+++142,
+++52,
+++162,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++244,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++187,
+++162,
+++192,
+++243,
+++188,
+++10,
+++128,
+++253,
+++43,
+++240,
+++3,
+++148,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++35,
+++141,
+++1,
+++132,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++185,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+ +66,
+- 1,
+++0,
+++255,
+++107,
+++142,
+++52,
+++146,
+++192,
+++243,
+++60,
+++128,
+++64,
+++255,
+++98,
+++141,
+++0,
+++52,
+++192,
+++243,
+++0,
+++0,
+++0,
+++254,
+++0,
+++240,
+ +53,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++147,
+++5,
+ +106,
+ +0,
+ +144,
+++177,
+++0,
+++88,
+++246,
+++163,
+++140,
+++1,
+++4,
+++128,
+++245,
+++99,
+++141,
+++10,
+++4,
+++88,
+++246,
+++162,
+++138,
+++1,
+++68,
+++0,
+++247,
+++162,
+++138,
+++36,
+++162,
+++88,
+++254,
+++162,
+++138,
+++3,
+++164,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++226,
+++137,
+++32,
+++2,
+++195,
+++243,
+++60,
+++0,
+++32,
+++247,
+++226,
+++137,
+++42,
+++114,
+++0,
+++255,
+++34,
+++138,
+++33,
+++18,
+++195,
+++243,
+++60,
+++0,
+++32,
+++247,
+++34,
+++138,
+++42,
+++130,
+++16,
+++246,
+++98,
+++138,
+++40,
+++114,
+++16,
+++246,
+++98,
+++138,
+++41,
+++146,
+++32,
+++246,
+++98,
+++138,
+++41,
+++146,
+++32,
+++246,
+++226,
+++137,
+++41,
+++146,
+++40,
+++246,
+++34,
+++138,
+++41,
+++146,
+++32,
+++247,
+++163,
+++141,
+++63,
+++178,
+++32,
+++247,
+++227,
+++141,
+++62,
+++162,
+++0,
+++254,
+++0,
+++240,
+++8,
+++4,
+++0,
+++240,
+++128,
+++11,
+++128,
+++253,
+++35,
+++240,
+++9,
+++100,
+++192,
+++243,
+++128,
+++10,
+++128,
+++253,
+++163,
+++141,
+++128,
+++115,
+ +192,
+++243,
+++152,
+++10,
+++88,
+++246,
+++163,
+++141,
+ +4,
+-+69,
+-+106,
+++100,
+++208,
+++246,
+++35,
+++139,
+ +0,
+-+144,
+++100,
+++32,
+++255,
+++34,
+++139,
+++53,
+++202,
+ +192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++139,
+++0,
+ +4,
+-+85,
+-+106,
+ +0,
+-+144,
+-+220,
+-+5,
+- 169,
+- 3,
+- 62,
+-@@ -2427,4 +2445,626 @@ unsigned char rpi_hevc_transform [] = {
+- 128,
+- 90,
+- 0,
+++240,
+++0,
+++160,
+++240,
+++246,
+++163,
+++141,
+++48,
+++98,
+++0,
+++247,
+++99,
+++139,
+++63,
+++210,
+++0,
+++247,
+++98,
+++139,
+++1,
+++212,
+++88,
+++254,
+++98,
+++139,
+++1,
+++212,
+++192,
+++243,
+++128,
+++11,
+++32,
+++255,
+++99,
+++139,
+++62,
+++98,
+++192,
+++243,
+++188,
+++10,
+++88,
+++246,
+++98,
+++139,
+++1,
+++212,
+++240,
+++246,
+++98,
+++139,
+++50,
+++210,
+++0,
+++247,
+++163,
+++128,
+++59,
+++146,
+++0,
+++247,
+++160,
+++128,
+++1,
+++36,
+++88,
+++254,
+++160,
+++128,
+++1,
+++36,
+++192,
+++243,
+++128,
+++11,
+++0,
+++247,
+++163,
+++128,
+++58,
+++98,
+++64,
+++255,
+++35,
+++240,
+++0,
+++100,
+++192,
+++243,
+++128,
+++10,
+++64,
+++255,
+++163,
+++128,
+++0,
+++164,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++160,
+++128,
+++1,
+++36,
+++240,
+++246,
+++160,
+++128,
+++50,
+++34,
+++8,
+++255,
+++227,
+++143,
+++54,
+++242,
+++192,
+++243,
+++60,
+++128,
+++40,
+++255,
+++227,
+++142,
+++54,
+++178,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++240,
+++39,
+++10,
+++0,
+++240,
+++60,
+++128,
+++8,
+++255,
+++163,
+++143,
+++45,
+++226,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++240,
+++44,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++0,
+++240,
+++40,
+++10,
+++0,
+++240,
+++60,
+++128,
+++8,
+++255,
+++163,
+++142,
+++2,
+++162,
+++192,
+++243,
+++60,
+++128,
+++90,
+++0,
+ +169,
+ +3,
+ +14,
+@@ -35609,15 +10256,100 @@ index b3f155f..4309f1c 100644
+ +30,
+ +33,
+ +3,
+- };
+++};
+ diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index b055208..5543093 100644
+---- a/libavcodec/rpi_hevc_transform.s
++new file mode 100644
++index 0000000..5543093
++--- /dev/null
+ +++ b/libavcodec/rpi_hevc_transform.s
+-@@ -85,6 +85,13 @@ hevc_trans_16x16:
+-   beq memclear16
+-   cmp r5,2
+-   beq hevc_deblock_16x16
++@@ -0,0 +1,917 @@
+++# ******************************************************************************
+++# Argon Design Ltd.
+++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+++#
+++# Module : HEVC
+++# Author : Peter de Rivaz
+++# ******************************************************************************
+++
+++# HEVC VPU Transform
+++#
+++# Transform matrix can be thought of as
+++#   output row vector = input row vector * transMatrix2
+++#
+++# The even rows of the matrix are symmetric
+++# The odd rows of the matrix are antisymmetric
+++#
+++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+++#
+++# EXAMPLE
+++#   (a b c d) (1 2  2  1)
+++#             (3 4 -4 -3)
+++#             (5 6  6  5)
+++#             (7 8 -8 -7)
+++#
+++#  x=(a c)(1 2) = 1a+5c 2a+6c
+++#         (5 6)
+++#
+++#  y=(b d)(3 4) = 3b+7d 4b+8d
+++#         (7 8)
+++#
+++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+++#
+++#  Final results are (u , v[::-1])
+++#
+++#
+++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+++#  Apply the even matrix first and stop before rounding
+++#  Then apply the odd matrix in a full manner:
+++#
+++#   First step is to compute partial products with the first input (16 cycles)
+++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
+++#   2a 4b 6c 8d
+++#   2a -4b 6c -8d
+++#   1a -3b 5c -7d
+++#
+++#   Second step is to sum partial products into final position (8 cycles)
+++#   1a+3b+5c+7d
+++#   2a+4b+6c+8d
+++#   2a-4b+6c-8d
+++#   1a-3b+5c-7d
+++#
+++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+++#
+++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+++#
+++#   For 8x8 we could compute two in parallel.
+++#
+++#
+++
+++# Columns are transformed first
+++#
+++# Store top left half of transMatrix2 in HX(32,0)
+++# Store bottom left half of transMatrix2 in HX(32,32)
+++#
+++# For 16x16
+++# HX(0:15,0) contains input data before transform
+++# HY(0:15,0) contains 32bit output data after transform
+++# HX(32,0) contains even rows of left half of transMatrix2
+++# HX(32,32) contains odd rows of left half of transMatrix2
+++# HY(48,0) contains partial products ready for summing
+++#
+++
+++
+++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+++# num: number of 16x16 transforms to be done
+++# coeffs32
+++# num32: number of 32x32 transforms
+++# command (r5): 0 for transform, 1 for memclear16(int16_t *dst,num16), 2 for hevc_deblock_16x16, 3 for hevc_uv_deblock_16x16, 5 for hevc_run_command_list
+++#
+++hevc_trans_16x16:
+++  cmp r5,1
+++  beq memclear16
+++  cmp r5,2
+++  beq hevc_deblock_16x16
+ +  cmp r5,3
+ +  beq hevc_uv_deblock_16x16
+ +  cmp r5,4
+@@ -35625,1937 +10357,4614 @@ index b055208..5543093 100644
+ +  cmp r5,5
+ +  beq hevc_run_command_list
+ +
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -708,3 +715,203 @@ normal_filtering:
+- 
+- filtering_done:
+-   b lr
+++  push r6-r15, lr # TODO cut down number of used registers
+++  mov r14,r3 # coeffs32
+++  mov r15,r4 # num32
+++  mov r3, 16*2 # Stride of transMatrix2 in bytes
+++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+++
+++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+++
+++  # Now use r0 to describe which matrix we are working on.
+++  # Allows us to prefetch the next block of coefficients for efficiency.
+++  mov r0,0 # This describes the location where we read our coefficients from
+++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+++  mov r7,16*16*2 # Total block size
+++  mov r8,64*16 # Value used to swap from current to next VRF location
+++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+++  mov r4,64 # Constant used for rounding first pass
+++  mov r5,1<<11 # Constant used for rounding second pass
+++
+++  # At start of each iteration r0,r1 point to the current block (that has already been loaded); r2 counts the 16x16 blocks still to do
+++block_loop:
+++  eor r0,r8 # Switch r0 to the other VRF location (r8 is 64*16)
+++  add r1,r7 # Step r1 on by one block (r7 is 16*16*2 bytes)
+++  # Prefetch the next block. NOTE(review): this also runs on the last iteration, so one block past the end of coeffs is read
+++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+++  eor r0,r8 # Back to the VRF location of the current block
+++  sub r1,r7 # Back to the memory address of the current block
+++
+++  # Transform the current block
+++  bl col_trans_16
+++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # First pass: add on rounding (r4 is 64), shift down by 7, and saturate
+++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
+++
+++  bl col_trans_16
+++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Second pass: add on rounding (r5 is 1<<11), shift down by 12, and saturate
+++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+++  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
+++
+++  # Save results - note there has been a transposition during the processing so we save columns
+++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
+++
+++  # Move onto next block
+++  eor r0,r8
+++  add r1,r7
+++# NOTE(review): the count is only tested below, after a block has been processed, so num==0 would presumably still transform one block - confirm callers never pass 0
+++  addcmpbgt r2,-1,0,block_loop
+++
+++  # Now go and do any 32x32 transforms
+++  b hevc_trans_32x32
+++# NOTE(review): the pop below looks unreachable after the unconditional branch; hevc_trans_32x32 ends with its own pop r6-r15, pc
+++  pop r6-r15, pc
+++
+++# r1,r2,r3 r7,r8 should be preserved (nothing below writes them)
+++# HX(0++,0)+r0 is the block to be transformed
+++# HX(32++,0) is the 16x16 matrix of transform coefficients (read below as VX(32,0++)); r6 is overwritten as the loop bound
+++# Use HY(48,0) for intermediate results
+++# r0 can be used, but should be returned to its original value at the end
+++col_trans_16:
+++  add r6,r0,16 # Final value for this loop
+++col_trans_16_loop:
+++  # First compute partial products for a single column
+++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+++  # Then sum up the results and place back
+++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+++  addcmpblt r0,1,r6,col_trans_16_loop # One column per iteration, until r0 reaches r6
+++  sub r0,16  # put r0 back to its original value
+++  b lr
+++# Same structure as col_trans_16 but multiplies by the matrix read as VX(32,32++) (the odd matrix loaded to HX(32++,32)); overwrites r6, restores r0
+++col_trans_odd_16:
+++  add r6,r0,16 # Final value for this loop
+++col_trans_odd_16_loop:
+++  # First compute partial products for a single column
+++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+++  # Then sum up the results and place back
+++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+++  addcmpblt r0,1,r6,col_trans_odd_16_loop # One column per iteration, until r0 reaches r6
+++  sub r0,16  # put r0 back to its original value
+++  b lr
+++
+++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) - taken from r14
+++# num: number of 32x32 transforms to be done - taken from r15
+++# Reached by a branch from hevc_trans_16x16 with r6-r15,lr still pushed; the final pop here returns to that caller
+++hevc_trans_32x32:
+++  mov r1,r14 # coeffs
+++  mov r2,r15 # num
+++
+++  # Both matrices were already loaded to HX(32++,0) and HX(32++,32) by hevc_trans_16x16, so the loads below stay disabled
+++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+++  #add r0, 16*16*2
+++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+++
+++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+++  mov r7, 16*16*2 # Total block size
+++  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+++  # set r8 to 32byte aligned stack pointer
+++  add r8,sp,31 # Round up: add 31 then clear the low 5 bits
+++  lsr r8,5
+++  lsl r8,5
+++  mov r9,r8  # Backup of the temporary storage
+++  mov r10,r1 # Backup of the coefficient buffer
+++block_loop32:
+++
+++  # COLUMN TRANSFORM
+++  mov r4, 64 # Constant used for rounding first pass
+++  mov r5, 9 # left shift used for rounding first pass
+++
+++  # Transform the first 16 columns
+++  mov r1,r10  # Input Coefficient buffer
+++  mov r8,r9   # Output temporary storage
+++  bl trans32
+++  # Transform the second 16 columns
+++  add r8,32*16*2
+++  add r1,32
+++  bl trans32
+++
+++  # ROW TRANSFORM
+++  mov r4, 1<<11 # Constant used for rounding second pass
+++  mov r5, 4 # left shift used for rounding second pass
+++
+++  mov r1,r9  # Input temporary storage
+++  mov r8,r10   # Output Coefficient buffer
+++  bl trans32
+++  # Transform the second 16 columns
+++  add r8,32*16*2
+++  add r1,32
+++  bl trans32
+++
+++  add r10, 32*32*2 # move onto next block of coefficients
+++  addcmpbgt r2,-1,0,block_loop32
+++# NOTE(review): the count is only tested above, after one block has been transformed, so num32==0 would presumably still process a block - confirm callers
+++  add sp,sp,32*32*2+32 # Restore stack
+++
+++  pop r6-r15, pc
+++# trans32: r1 = input (row stride r3), r8 = output (row stride 32*2), r4 = rounding constant, r5 = left shift; overwrites r0 and r6
+++trans32:
+++  push lr # Saved because col_trans_16/col_trans_odd_16 are called below
+++  # We can no longer afford the VRF space to do prefetching when doing 32x32
+++  # Fetch the even rows
+++  vldh HX(0++,0),(r1 += r3) REP 16
+++  # Fetch the odd rows
+++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+++
+++  # Transform the even rows using even matrix
+++  mov r0, 0 # Even rows
+++  bl col_trans_16
+++
+++  # Now transform the odd rows using odd matrix
+++  mov r0, 64*16 # Odd rows
+++  bl col_trans_odd_16
+++
+++  # Now apply butterfly to compute the first 16 results
+++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # then shift left by r5 (9 or 4 depending on the pass) so the result sits in the top half; not saturating
+++  # 16bit results now in HX(48,32)
+++  mov r0,r8
+++  mov r6,32*2 # Output row stride in bytes
+++  vsth VX(48,32++),(r0+=r6) REP 16
+++
+++  # Now apply butterfly to compute the second 16 results (in reverse order)
+++  vsub HY(63,0),HY(0 ,0),HY(16,0)
+++  vsub HY(62,0),HY(1 ,0),HY(17,0)
+++  vsub HY(61,0),HY(2 ,0),HY(18,0)
+++  vsub HY(60,0),HY(3 ,0),HY(19,0)
+++  vsub HY(59,0),HY(4 ,0),HY(20,0)
+++  vsub HY(58,0),HY(5 ,0),HY(21,0)
+++  vsub HY(57,0),HY(6 ,0),HY(22,0)
+++  vsub HY(56,0),HY(7 ,0),HY(23,0)
+++  vsub HY(55,0),HY(8 ,0),HY(24,0)
+++  vsub HY(54,0),HY(9 ,0),HY(25,0)
+++  vsub HY(53,0),HY(10,0),HY(26,0)
+++  vsub HY(52,0),HY(11,0),HY(27,0)
+++  vsub HY(51,0),HY(12,0),HY(28,0)
+++  vsub HY(50,0),HY(13,0),HY(29,0)
+++  vsub HY(49,0),HY(14,0),HY(30,0)
+++  vsub HY(48,0),HY(15,0),HY(31,0)
+++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # then shift left by r5 (9 or 4 depending on the pass) so the result sits in the top half; not saturating
+++  add r0,r8,32 # Second half of each output row (16 shorts further on)
+++  vsth VX(48,32++),(r0+=r6) REP 16
+++  pop pc
+++
+++memclear16:
+++  # r0 is address
+++  # r1 is number of 16bits values to set to 0 (cleared in chunks of 16*16 values, at least one chunk, so may overrun past end and clear more than specified)
+++  vmov HX(0++,0),0 REP 16 # Zero 16 rows of the VRF to use as the store source
+++  mov r2,32 # Stride between stored rows in bytes
+++loop:
+++  vsth HX(0++,0),(r0+=r2) REP 16 # Store the 16 zeroed rows
+++  add r0,16*16*2 # Advance the address by the bytes just written
+++  sub r1,16*16 # 16*16 values done per iteration
+++  cmp r1,0
+++  bgt loop
+++  b lr
+++
+++
+++################################################################################
+++# HEVC VPU Deblock
+++#
+++# Vertical edges before horizontal
+++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+++#
+++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+++# The VPU code works in units of 16x16 blocks.
+++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+++# One final horizontal filter is required at the end.
+++# PCM is not allowed in this code.
+++#
+++#
+++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
+++
+++.set P0,63
+++.set P1,62
+++.set P2,61
+++.set P3,60
+++.set Q0,59
+++.set Q1,58
+++.set Q2,57
+++.set Q3,56
+++# Named temporaries for the filters below, used as the first coordinate of HX()/H(), e.g. HX(dp,0)
+++.set dp,32
+++.set dq,33
+++.set d,34
+++.set decision,35
+++.set beta,36
+++.set beta2,37
+++.set beta3,38
+++.set ptest,39
+++.set qtest,40
+++.set pqtest,41
+++.set thresh,42
+++.set deltatest, 44
+++.set deltap1, 45
+++.set tc25, 46
+++.set setup,47
+++.set tc,48
+++.set tc25,49
+++.set tc2, 50
+++.set do_filter, 51
+++.set delta, 52
+++.set tc10, 53
+++.set delta0, 54
+++.set delta1, 55
+++.set zeros, 0
+++.set setup_input, 1
+++.set deltaq1, 2
+++# NOTE(review): tc25 is assigned twice above (46 and then 49) - presumably only one is intended; confirm which value the assembler uses and drop the other
+++
+++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+++# Row has num16 16x16 blocks across
+++# Beta goes from 0 to 64
+++# tc goes from 0 to 24
+++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+++#   has 8 bytes per edge
+++#   has 16 bytes per direction
+++#   has 32 bytes per 16x16 block
+++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
+++hevc_deblock_16x16:
+++  push r6-r15, lr
+++  mov r9,r4
+++  mov r4,r3
+++  mov r13,r2
+++  mov r2,r0
+++  mov r10,r0
+++  subscale4 r0,r1
+++  mov r8,63
+++  mov r6,-3
+++  vmov H(zeros,0),0
+++# r7 is number of blocks still to load
+++# r0 is location of current block - 4 * stride
+++# r1 is stride
+++# r2 is location of current block
+++# r3 is offset of start of block (vert_filter works on the edge at H(16,16)+r3, horz_filter on the edge at H(16,0)+r3)
+++# r4 is setup
+++# r5 is for temporary calculations
+++# r8 holds 63
+++# r6 holds -3
+++# r9 holds the number of 16 high rows to process
+++# r10 holds the original img base
+++# r11 returns 0 if no filtering was done on the edge
+++# r12 saves a copy of this
+++# r13 is copy of width
+++
+++process_row:
+++  # First iteration does not do horizontal filtering on previous
+++  mov r7, r13
+++  mov r3,0
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+++  vstb H(zeros,0),(r4)
+++  bl vert_filter
+++  add r3,8
+++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+++  bl vert_filter
+++  sub r3,8
+++  b start_deblock_loop
+++deblock_loop:
+++  # Middle iterations do vertical on current block and horizontal on preceding
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)
+++  vstb H(zeros,0),(r4)
+++  bl vert_filter
+++  add r3,8
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl vert_filter
+++  sub r3,8
+++  vldb H(setup_input,0), -16(r4)
+++  vstb H(zeros,0),-16(r4)
+++  bl horz_filter
+++  mov r12,r11
+++  add r3,8*64
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl horz_filter
+++  sub r3,8*64
+++  addcmpbeq r12,0,0,skip_save_top
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++skip_save_top:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++start_deblock_loop:
+++  # move onto next 16x16 (could do this with circular buffer support instead)
+++  add r3,16
+++  and r3,r8
+++  add r4,32
+++  # Perform loop counter operations (may work with an addcmpbgt as well?)
+++  add r0,16
+++  add r2,16
+++  sub r7,1
+++  cmp r7,0 # Are there still more blocks to load
+++  bgt deblock_loop
+++
+++  # Final iteration needs to just do horizontal filtering
+++  vldb H(setup_input,0), -16(r4)
+++  vstb H(zeros,0),-16(r4)
+++  bl horz_filter
+++  mov r12,r11
+++  add r3,8*64
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl horz_filter
+++  sub r3,64*8
+++  addcmpbeq r12,0,0,skip_save_top2
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++skip_save_top2:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++
+++# Now look to see if we should do another row
+++  sub r9,1
+++  cmp r9,0
+++  bgt start_again
+++  pop r6-r15, pc
+++start_again:
+++  # Need to sort out r0,r2 to point to next row down
+++  addscale16 r10,r1
+++  mov r2,r10
+++  subscale4 r0,r2,r1
+++  b process_row
+++
+++
+++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+++
+++vert_filter:
+++  push lr
+++# Copy the 8 columns around the edge (12..15 on the P side, 16..19 on the Q side) into the working locations
+++  vmov HX(P3,0), V(16,12)+r3
+++  vmov HX(P2,0), V(16,13)+r3
+++  vmov HX(P1,0), V(16,14)+r3
+++  vmov HX(P0,0), V(16,15)+r3
+++  vmov HX(Q0,0), V(16,16)+r3
+++  vmov HX(Q1,0), V(16,17)+r3
+++  vmov HX(Q2,0), V(16,18)+r3
+++  vmov HX(Q3,0), V(16,19)+r3
+++
+++  bl do_luma_filter
+++# Write the six possibly modified columns back (the vadds with 0 is the final saturation step mentioned above)
+++  vadds V(16,13)+r3, HX(P2,0), 0
+++  vadds V(16,14)+r3, HX(P1,0), 0
+++  vadds V(16,15)+r3, HX(P0,0), 0
+++  # P3 and Q3 never change so don't bother saving back
+++  vadds V(16,16)+r3, HX(Q0,0), 0
+++  vadds V(16,17)+r3, HX(Q1,0), 0
+++  vadds V(16,18)+r3, HX(Q2,0), 0
+++
+++  pop pc
+++
+++# Filter edge at H(16,0)+r3; r11 as set by do_luma_filter is left untouched for the caller
+++horz_filter:
+++  push lr
+++# Copy the 8 rows around the edge (12..15 on the P side, 16..19 on the Q side) into the working locations
+++  vmov HX(P3,0), H(12,0)+r3
+++  vmov HX(P2,0), H(13,0)+r3
+++  vmov HX(P1,0), H(14,0)+r3
+++  vmov HX(P0,0), H(15,0)+r3
+++  vmov HX(Q0,0), H(16,0)+r3
+++  vmov HX(Q1,0), H(17,0)+r3
+++  vmov HX(Q2,0), H(18,0)+r3
+++  vmov HX(Q3,0), H(19,0)+r3
+++
+++  bl do_luma_filter
+++# Write the six possibly modified rows back through a saturating add of 0
+++  vadds H(13,0)+r3, HX(P2,0), 0
+++  vadds H(14,0)+r3, HX(P1,0), 0
+++  vadds H(15,0)+r3, HX(P0,0), 0
+++  # P3 and Q3 never change so don't bother saving back
+++  vadds H(16,0)+r3, HX(Q0,0), 0
+++  vadds H(17,0)+r3, HX(Q1,0), 0
+++  vadds H(18,0)+r3, HX(Q2,0), 0
+++
+++  pop pc
+++
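+++# H(setup_input,0) holds the beta/tc values for each 4 length edge (callers load it from the setup array at r4); r11 is returned as 1 if anything was filtered, else 0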
+++do_luma_filter:
+++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+++  valtl HX(beta,0),H(setup,0),H(setup,0)
+++  valtu HX(tc,0),H(setup,0),H(setup,0)
+++  vmul HX(tc25,0), HX(tc,0), 5
+++  vadd HX(tc25,0),HX(tc25,0), 1
+++  vasr HX(tc25,0), HX(tc25,0), 1
+++
+++  # Compute decision
+++  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+++  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+++  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+++  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+++
+++  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+++  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+++  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+++  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+++
+++  vadd HX(d,0), HX(dp,0), HX(dq,0)
+++  vasr HX(beta2,0),HX(beta,0),2
+++  vasr HX(beta3,0),HX(beta,0),3
+++
+++  # Compute flags that are negative if all conditions pass
+++  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+++  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+++  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+++
+++  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+++  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+++  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+++  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+++  vmov HX(decision,0), 1 IFNN
+++  vadd H(decision,0),H(decision,3),0 IFN
+++  vadd H(decision,16),H(decision,19),0 IFN
+++  vmov -,HX(decision,0) SETF   # N marks strong filter
+++  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
+++
+++  vadd HX(do_filter,0), HX(d,3), HX(d,0)
+++  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+++  vmov HX(decision,0),0 IFNN # Z marks no filter
+++
+++  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
+++  # First extract out even terms
+++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
+++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
+++  # Now expand back
+++  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+++  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+++
+++  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+++
+++  # Do a quick check to see if there is anything to do
+++  mov r11, 0 # Signal no filtering
+++  vmov -,1 IFNZ SUMS r5
+++  cmp r5,0
+++  beq filtering_done
+++  mov r11, 1 # Signal some filtering
+++  # And whether there is any strong filtering
+++  vmov -,1 IFN SUMS r5
+++  cmp r5,0
+++  beq normal_filtering
+++
+++  ##############################################################################
+++  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+++  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+++
+++  # Take a copy of the original pixels for use in decision calculation
+++  vmov HX(P0,32),HX(P0,0)
+++  vmov HX(Q0,32),HX(Q0,0)
+++  vmov HX(P1,32),HX(P1,0)
+++  vmov HX(Q1,32),HX(Q1,0)
+++  vmov HX(P2,32),HX(P2,0)
+++  vmov HX(Q2,32),HX(Q2,0)
+++
+++  vadd -,HX(P2,32),4 CLRA SACC
+++  vshl -,HX(P1,32),1 SACC
+++  vshl -,HX(P0,32),1 SACC
+++  vshl -,HX(Q0,32),1 SACC
+++  vshl HX(delta,0),HX(Q1,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(P0,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+++
+++  vadd -,HX(P2,32),2 CLRA SACC
+++  vadd -,HX(P1,32),HX(P0,32) SACC
+++  vshl HX(delta,0),HX(Q0,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 2
+++  vsub HX(delta,0),HX(delta,0),HX(P1,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+++
+++  vadd -,HX(Q0,32),4 CLRA SACC
+++  vadd -,HX(P1,32),HX(P0,32) SACC
+++  vmul -,HX(P2,32),3 SACC
+++  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(P2,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+++  #vmov HX(P2,0),3 IFN
+++
+++  # Now reverse all P/Qs
+++
+++  vadd -,HX(Q2,32),4 CLRA SACC
+++  vshl -,HX(Q1,32),1 SACC
+++  vshl -,HX(Q0,32),1 SACC
+++  vshl -,HX(P0,32),1 SACC
+++  vshl HX(delta,0),HX(P1,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+++
+++  vadd -,HX(Q2,32),2 CLRA SACC
+++  vadd -,HX(Q1,32),HX(Q0,32) SACC
+++  vshl HX(delta,0),HX(P0,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 2
+++  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+++
+++  vadd -,HX(P0,32),4 CLRA SACC
+++  vadd -,HX(Q1,32),HX(Q0,32) SACC
+++  vmul -,HX(Q2,32),3 SACC
+++  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+++
+++  ##############################################################################
+++  # Normal filtering
+++normal_filtering:
+++  # Invert the decision flags
+++  # make instruction more complicated as assembler has error and loses SETF
+++  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+++  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
+++
+++  vmov -,1 IFN SUMS r5
+++  cmp r5,0
+++  beq filtering_done
+++
+++  vasr HX(tc2,0), HX(tc,0), 1
+++  vmul HX(tc10,0), HX(tc,0), 10
+++
+++  vasr HX(thresh,0), HX(beta,0), 1
+++  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+++  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+++
+++  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+++  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+++  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+++  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+++  # Expand ptest and qtest together
+++  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
+++  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+++  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+++  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+++  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+++
+++  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+++  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+++  vmov -,8 CLRA SACC
+++  vmul -,HX(delta0,0), 9 SACC
+++  vmul HX(delta0,0),HX(delta1,0), r6 SACC
+++  vasr HX(delta0,0), HX(delta0,0), 4
+++  vdist HX(deltatest,0), HX(delta0,0), 0
+++  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+++  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+++
+++  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+++
+++  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+++  vadd HX(deltap1,0), HX(deltap1,0), 1
+++  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+++  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+++  vasr HX(deltap1,0), HX(deltap1,0), 1
+++  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+++
+++  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) # Q2+Q0
+++  vadd HX(deltaq1,0), HX(deltaq1,0), 1
+++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC # (Q2+Q0+1)>>1, presumably also seeding the accumulator
+++  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) # NOTE(review): no SACC and deltaq1 is overwritten two lines below before being read - looks redundant, confirm
+++  vrsub -, HX(delta0,0), 0 SACC # presumably accumulates -delta0
+++  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC # presumably accumulates -Q1 and writes the total
+++  vasr HX(deltaq1,0), HX(deltaq1,0), 1
+++  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) # Limit by tc2 (tc/2 in normal filtering)
+++
+++  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+++  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+++
+++  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+++  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+++
+++  vmov -,HX(deltatest,0) SETF
+++  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+++  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+++
+++  #vmov HX(P2,0),1 IFN
+++
+++filtering_done:
+++  b lr
+++
+++# Chroma deblock entry points: registers are unpacked by hevc_uv_start exactly as in hevc_deblock_16x16; r14 selects whether setup bytes are zeroed after being read
+++hevc_uv_deblock_16x16:
+++  push r6-r15, lr
+++  mov r14,0 # Leave the setup array untouched
+++  b hevc_uv_start
+++hevc_uv_deblock_16x16_with_clear:
+++  push r6-r15, lr
+++  mov r14,1 # Store zeros over each setup entry once it has been loaded
+++  b hevc_uv_start
+++
+++hevc_uv_start:
+++  mov r9,r4
+++  mov r4,r3
+++  mov r13,r2
+++  mov r2,r0
+++  mov r10,r0
+++  subscale4 r0,r1
+++  mov r8,63
+++  mov r6,-3
+++  vmov H(zeros,0),0
+++# r7 is number of blocks still to load
+++# r0 is location of current block - 4 * stride
+++# r1 is stride
+++# r2 is location of current block
+++# r3 is offset of start of block (uv_vert_filter works on the edge at H(16,16)+r3, uv_horz_filter on the edge at H(16,0)+r3)
+++# r4 is setup
+++# r5 is for temporary calculations
+++# r8 holds 63
+++# r6 holds -3
+++# r9 holds the number of 16 high rows to process
+++# r10 holds the original img base
+++# r11 returns 0 if no filtering was done on the edge
+++# r12 saves a copy of this
+++# r13 is copy of width
+++# r14 is 1 if we should clear the old contents, or 0 if not
+++
+++uv_process_row:
+++  # First iteration does not do horizontal filtering on previous
+++  mov r7, r13
+++  mov r3,0
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+++  cmp r14,1
+++  bne uv_skip0
+++  vstb H(zeros,0),(r4)
+++uv_skip0:
+++  bl uv_vert_filter
+++  add r3,8
+++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+++  bl uv_vert_filter
+++  sub r3,8
+++  b uv_start_deblock_loop
+++uv_deblock_loop:
+++  # Middle iterations do vertical on current block and horizontal on preceding
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)
+++  cmp r14,1
+++  bne uv_skip1
+++  vstb H(zeros,0),(r4)
+++uv_skip1:
+++  bl uv_vert_filter
+++  add r3,8
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_vert_filter
+++  sub r3,8
+++  vldb H(setup_input,0), -16(r4)
+++  cmp r14,1
+++  bne uv_skip3
+++  vstb H(zeros,0),-16(r4)
+++uv_skip3:
+++  bl uv_horz_filter
+++  mov r12,r11
+++  add r3,8*64
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_horz_filter
+++  sub r3,8*64
+++  addcmpbeq r12,0,0,uv_skip_save_top
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++uv_skip_save_top:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++uv_start_deblock_loop:
+++  # move onto next 16x16 (could do this with circular buffer support instead)
+++  add r3,16
+++  and r3,r8
+++  add r4,32
+++  # Perform loop counter operations (may work with an addcmpbgt as well?)
+++  add r0,16
+++  add r2,16
+++  sub r7,1
+++  cmp r7,0 # Are there still more blocks to load
+++  bgt uv_deblock_loop
+++
+++  # Final iteration needs to just do horizontal filtering
+++  vldb H(setup_input,0), -16(r4)
+++  cmp r14,1
+++  bne uv_skip2
+++  vstb H(zeros,0),-16(r4)
+++uv_skip2:
+++  bl uv_horz_filter
+++  mov r12,r11
+++  add r3,8*64
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_horz_filter
+++  sub r3,64*8
+++  addcmpbeq r12,0,0,uv_skip_save_top2
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++uv_skip_save_top2:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++
+++# Now look to see if we should do another row
+++  sub r9,1
+++  cmp r9,0
+++  bgt uv_start_again
+++  pop r6-r15, pc
+++uv_start_again:
+++  # Need to sort out r0,r2 to point to next row down
+++  addscale16 r10,r1
+++  mov r2,r10
+++  subscale4 r0,r2,r1
+++  b uv_process_row
+++
+++
+++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+++# So we can reuse the code we move the parts to be filtered into HX(P1/P0/Q0/Q1,0) - we will perform a final saturation step on placing P0 and Q0 back into the correct locations
+++
+++uv_vert_filter:
+++  push lr
+++# Only two columns either side of the edge are needed for chroma
+++  vmov HX(P1,0), V(16,14)+r3
+++  vmov HX(P0,0), V(16,15)+r3
+++  vmov HX(Q0,0), V(16,16)+r3
+++  vmov HX(Q1,0), V(16,17)+r3
+++
+++  bl do_chroma_filter
+++# do_chroma_filter only writes P0 and Q0, so only those are stored back
+++  vadds V(16,15)+r3, HX(P0,0), 0
+++  vadds V(16,16)+r3, HX(Q0,0), 0
+++
+++  pop pc
+++
+++# Filter edge at H(16,0)+r3
+++uv_horz_filter:
+++  push lr
+++# Only two rows either side of the edge are needed for chroma
+++  vmov HX(P1,0), H(14,0)+r3
+++  vmov HX(P0,0), H(15,0)+r3
+++  vmov HX(Q0,0), H(16,0)+r3
+++  vmov HX(Q1,0), H(17,0)+r3
+++
+++  bl do_chroma_filter
+++
+++  vadds H(15,0)+r3, HX(P0,0), 0
+++  # P1 and Q1 are not written by do_chroma_filter so don't bother saving back
+++  vadds H(16,0)+r3, HX(Q0,0), 0
+++
+++  pop pc
+++
+++# Chroma filter on HX(P1/P0/Q0/Q1,0); tc values are taken from H(setup_input,0), which callers load from the setup array at r4
+++do_chroma_filter:
+++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+++  valtl HX(tc,0),H(setup,0),H(setup,0)
+++# Presumably delta = clamp((((Q0-P0)<<2) + (P1-Q1) + 4) >> 3, -tc, tc), summed in the accumulator - confirm SACC/vclamps semantics
+++  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+++  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+++  vsub -,HX(P1,0),HX(Q1,0) SACC
+++  vmov HX(delta,0),4 SACC
+++  vasr HX(delta,0),HX(delta,0),3
+++  vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+++  vadd HX(P0,0),HX(P0,0),HX(delta,0) # P0 += delta
+++  vsub HX(Q0,0),HX(Q0,0),HX(delta,0) # Q0 -= delta
+++  b lr
+++
+++# r0 = list of commands; each command is six 32bit words that are loaded into r0-r5 before calling hevc_trans_16x16
+++# r1 = number of commands (the first command is run before the count is tested)
+++hevc_run_command_list:
+++  push r6-r7, lr
+++  mov r6, r0 # r6 walks the list
+++  mov r7, r1 # r7 counts the commands left
+++loop_cmds:
+++  ld r0,(r6) # How to encode r6++?
+++  add r6,4
+++  ld r1,(r6)
+++  add r6,4
+++  ld r2,(r6)
+++  add r6,4
+++  ld r3,(r6)
+++  add r6,4
+++  ld r4,(r6)
+++  add r6,4
+++  ld r5,(r6) # Sixth word is the command number that hevc_trans_16x16 dispatches on
+++  add r6,4
+++  bl hevc_trans_16x16
+++  sub r7,1
+++  cmp r7,0
+++  bgt loop_cmds
+++
+++  pop r6-r7, pc
++diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
++new file mode 100644
++index 0000000..3904efc
++--- /dev/null
+++++ b/libavcodec/rpi_mailbox.c
++@@ -0,0 +1,340 @@
+++/*
+++Copyright (c) 2012, Broadcom Europe Ltd.
+++All rights reserved.
+++
+++Redistribution and use in source and binary forms, with or without
+++modification, are permitted provided that the following conditions are met:
+++    * Redistributions of source code must retain the above copyright
+++      notice, this list of conditions and the following disclaimer.
+++    * Redistributions in binary form must reproduce the above copyright
+++      notice, this list of conditions and the following disclaimer in the
+++      documentation and/or other materials provided with the distribution.
+++    * Neither the name of the copyright holder nor the
+++      names of its contributors may be used to endorse or promote products
+++      derived from this software without specific prior written permission.
+++
+++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+++*/
+++
+++#include <stdio.h>
+++#include <string.h>
+++#include <stdlib.h>
+++#include <fcntl.h>
+++#include <unistd.h>
+++#include <assert.h>
+++#include <stdint.h>
+++#include <sys/mman.h>
+++#include <sys/ioctl.h>
+++
+++#include <linux/ioctl.h>
+++
+++#define MAJOR_NUM 100
+++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+++#define DEVICE_FILE_NAME "/dev/vcio"
+++
+++#include "rpi_mailbox.h"
+++
+++#define PAGE_SIZE (4*1024)
+++
+++// Map physical range [base, base+size) through /dev/mem with MAP_SHARED (not cached in ARM cache); returns NULL on failure
+++void *mapmem_shared(unsigned base, unsigned size)
+++{
+++   int mem_fd;
+++   unsigned offset = base % PAGE_SIZE;   // mmap needs a page-aligned file offset
+++   base = base - offset;
+++   /* open /dev/mem */
+++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+++      return NULL;
+++   }
+++   void *mem = mmap(
+++      0,
+++      size + offset,   /* the mapping starts `offset` bytes early, so extend it to still cover base+size */
+++      PROT_READ|PROT_WRITE,
+++      MAP_SHARED/*|MAP_FIXED*/,
+++      mem_fd,
+++      base);
+++#ifdef DEBUG
+++   printf("base=0x%x, mem=%p\n", base, mem);
+++#endif
+++   close(mem_fd);   // a mapping outlives its descriptor; closing here also covers the failure path
+++   if (mem == MAP_FAILED) {
+++      printf("mmap error %d\n", (int)mem);
+++      return NULL;
+++   }
+++   return (char *)mem + offset;   // hand back the address corresponding to the caller's original base
+++}
+++
+++// Map [base, base+size) through /dev/mem with MAP_PRIVATE (faster as it lives in ARM cache, but requires cache flushing); NULL on failure
+++void *mapmem_private(unsigned base, unsigned size)
+++{
+++   int mem_fd;
+++   unsigned offset = base % PAGE_SIZE;   // mmap needs a page-aligned file offset
+++   base = base - offset;
+++   /* open /dev/mem */
+++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+++      return NULL;
+++   }
+++   void *mem = mmap(
+++      0,
+++      size + offset,   /* the mapping starts `offset` bytes early, so extend it to still cover base+size */
+++      PROT_READ|PROT_WRITE,
+++      MAP_PRIVATE/*|MAP_FIXED*/,
+++      mem_fd,
+++      base);
+++#ifdef DEBUG
+++   printf("base=0x%x, mem=%p\n", base, mem);
+++#endif
+++   close(mem_fd);   // a mapping outlives its descriptor; closing here also covers the failure path
+++   if (mem == MAP_FAILED) {
+++      printf("mmap error %d\n", (int)mem);
+++      return NULL;
+++   }
+++   return (char *)mem + offset;   // hand back the address corresponding to the caller's original base
+++}
+++
+++void unmapmem(void *addr, unsigned size)   // unmap `size` bytes at `addr`; does not return if munmap fails
+++{
+++   int s = munmap(addr, size);   // NOTE(review): munmap wants a page-aligned addr, but mapmem_*() add the in-page offset to what they return - confirm callers only map aligned bases
+++   if (s != 0) {
+++      printf("munmap error %d\n", s);
+++      exit (-1);   // an unmap failure terminates the whole process
+++   }
+++}
+++
+++/*
+++ * Send the property message in buf through the mailbox ioctl; returns the ioctl result (a failure is logged, not fatal)
+++ */
+++
+++static int mbox_property(int file_desc, void *buf)
+++{
+++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+++
+++   if (ret_val < 0) {
+++      printf("ioctl_set_msg failed:%d\n", ret_val);
+++   }
+++
+++#ifdef DEBUG
+++   unsigned *p = buf; unsigned i; unsigned size = *(unsigned *)buf;   // word 0 holds the message size in bytes
+++   for (i=0; i<size/4; i++)
+++      printf("%04x: 0x%08x\n", (unsigned)(i*sizeof *p), p[i]);   // cast: %x expects unsigned, not size_t
+++#endif
+++   return ret_val;
+++}
+++
+++unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)   // returns the value left in p[5], or 0 if the ioctl failed
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x3000c; // (the tag id)
+++   p[i++] = 12; // (size of the buffer)
+++   p[i++] = 12; // (size of the data)
+++   p[i++] = size; // (num bytes? or pages?)
+++   p[i++] = align; // (alignment)
+++   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   if (mbox_property(file_desc, p) < 0) return 0; // failed request: p[5] may still hold the caller's `size`, which is not a handle
+++   return p[5];
+++}
+++
+++unsigned mem_free(int file_desc, unsigned handle)   // sends tag 0x3000f for `handle` and returns whatever is in p[5] afterwards
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x3000f; // (the tag id)
+++   p[i++] = 4; // (size of the buffer)
+++   p[i++] = 4; // (size of the data)
+++   p[i++] = handle;   // p[5]: request value, read back below
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // NOTE(review): ioctl result is ignored; on failure p[5] may still be the handle
+++   return p[5];
+++}
+++
+++unsigned mem_lock(int file_desc, unsigned handle)   // returns the value left in p[5], or 0 if the ioctl failed
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x3000d; // (the tag id)
+++   p[i++] = 4; // (size of the buffer)
+++   p[i++] = 4; // (size of the data)
+++   p[i++] = handle;   // p[5]: request value, read back below
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   if (mbox_property(file_desc, p) < 0) return 0; // failed request: do not pass the echoed handle off as an address
+++   return p[5];
+++}
+++
+++unsigned mem_unlock(int file_desc, unsigned handle)   // sends tag 0x3000e for `handle` and returns whatever is in p[5] afterwards
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x3000e; // (the tag id)
+++   p[i++] = 4; // (size of the buffer)
+++   p[i++] = 4; // (size of the data)
+++   p[i++] = handle;   // p[5]: request value, read back below
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // NOTE(review): ioctl result is ignored; on failure p[5] may still be the handle
+++   return p[5];
+++}
+++
+++unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)   // sends tag 0x30010 with code + r0..r5; returns p[5]
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x30010; // (the tag id)
+++   p[i++] = 28; // (size of the buffer)
+++   p[i++] = 28; // (size of the data)
+++   p[i++] = code;   // p[5]: first of the 7 payload words (7*4 = 28 bytes)
+++   p[i++] = r0;
+++   p[i++] = r1;
+++   p[i++] = r2;
+++   p[i++] = r3;
+++   p[i++] = r4;
+++   p[i++] = r5;
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // NOTE(review): ioctl result is ignored; on failure p[5] may still be `code`
+++   return p[5];
+++}
+++
+++unsigned qpu_enable(int file_desc, unsigned enable)   // sends tag 0x30012 with `enable`; returns p[5] (gpu_init treats non-zero as failure)
+++{
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++
+++   p[i++] = 0x30012; // (the tag id)
+++   p[i++] = 4; // (size of the buffer)
+++   p[i++] = 4; // (size of the data)
+++   p[i++] = enable;   // p[5]: request value, read back below
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // NOTE(review): ioctl result is ignored; on failure p[5] may still be `enable`
+++   return p[5];
+++}
+++
+++unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {   // sends tag 0x30011; returns p[5]
+++   int i=0;
+++   unsigned p[32];   // property message, built word by word
+++
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++   p[i++] = 0x30011; // (the tag id)
+++   p[i++] = 16; // (size of the buffer)
+++   p[i++] = 16; // (size of the data)
+++   p[i++] = num_qpus;   // p[5]: first of the 4 payload words (4*4 = 16 bytes)
+++   p[i++] = control;
+++   p[i++] = noflush;
+++   p[i++] = timeout; // ms
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // NOTE(review): ioctl result is ignored; on failure p[5] may still be `num_qpus`
+++   return p[5];
+++}
+++
+++void execute_multi(int file_desc,   // sends one tag 0x30018 message carrying two QPU parameter sets followed by two code/register sets
+++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+++   int i=0;
+++   unsigned p[32];   // 2 header + 3 tag + 22 payload + 1 end = 28 words used
+++
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++   p[i++] = 0x30018; // (the tag id)
+++   p[i++] = 88; // (size of the buffer)
+++   p[i++] = 88; // (size of the data)
+++
+++   p[i++] = num_qpus;   // first QPU parameter set (4 words)
+++   p[i++] = control;
+++   p[i++] = noflush;
+++   p[i++] = timeout; // ms
+++
+++   p[i++] = num_qpus_2;   // second QPU parameter set (4 words)
+++   p[i++] = control_2;
+++   p[i++] = noflush_2;
+++   p[i++] = timeout_2; // ms
+++
+++   p[i++] = code;   // first code address + r0..r5 (7 words)
+++   p[i++] = r0;
+++   p[i++] = r1;
+++   p[i++] = r2;
+++   p[i++] = r3;
+++   p[i++] = r4;
+++   p[i++] = r5;
+++
+++   p[i++] = code_2;   // second code address + r0..r5 (7 words); 22 payload words = 88 bytes
+++   p[i++] = r0_2;
+++   p[i++] = r1_2;
+++   p[i++] = r2_2;
+++   p[i++] = r3_2;
+++   p[i++] = r4_2;
+++   p[i++] = r5_2;
+++
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   mbox_property(file_desc, p);   // result is discarded; the function reports nothing to the caller
+++   return;
+++}
+++
+++int mbox_open(void) {   // (void) makes this a real prototype, matching the declaration in rpi_mailbox.h
+++   int file_desc;
+++
+++   // open a char device file used for communicating with kernel mbox driver
+++   file_desc = open(DEVICE_FILE_NAME, 0);
+++   if (file_desc < 0) {
+++      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+++      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+++   }
+++   return file_desc;   // negative when the open failed; the caller must check
+++}
+++
+++void mbox_close(int file_desc) {   // closes a descriptor obtained from mbox_open()
+++  close(file_desc);   // return value of close() is not checked
+++}
++diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
++new file mode 100644
++index 0000000..5898102
++--- /dev/null
+++++ b/libavcodec/rpi_mailbox.h
++@@ -0,0 +1,25 @@
+++#ifndef RPI_MAILBOX_H
+++#define RPI_MAILBOX_H
+++
+++extern int mbox_open(void);   // returns the mailbox device descriptor, negative on failure
+++extern void mbox_close(int file_desc);
+++
+++extern unsigned get_version(int file_desc);   // NOTE(review): no definition is visible in rpi_mailbox.c - confirm it exists before calling
+++extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+++extern unsigned mem_free(int file_desc, unsigned handle);
+++extern unsigned mem_lock(int file_desc, unsigned handle);
+++extern unsigned mem_unlock(int file_desc, unsigned handle);
+++extern void *mapmem_shared(unsigned base, unsigned size);   // NULL on failure
+++extern void *mapmem_private(unsigned base, unsigned size);   // NULL on failure
+++extern void unmapmem(void *addr, unsigned size);   // calls exit(-1) if munmap fails
+++
+++extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+++extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+++extern void execute_multi(int file_desc,
+++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+++extern unsigned qpu_enable(int file_desc, unsigned enable);
+++
+++#endif
++diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
++new file mode 100644
++index 0000000..a01c051
++--- /dev/null
+++++ b/libavcodec/rpi_qpu.c
++@@ -0,0 +1,991 @@
+++#ifdef RPI
+++// Use vchiq service for submitting jobs
+++#define GPUSERVICE
+++
+++// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+++// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+++//#define RPI_TIME_TOTAL_QPU
+++// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
+++//#define RPI_TIME_TOTAL_VPU
+++// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+++#define RPI_TIME_TOTAL_POSTED
+++
+++#include <stdio.h>
+++#include <stdlib.h>
+++#include <string.h>
+++#include <stddef.h>
+++#include <stdint.h>
+++#include "libavutil/avassert.h"
+++
+++#include "config.h"
+++
+++#include <pthread.h>
+++#include <time.h>
+++
+++#include "rpi_mailbox.h"
+++#include "rpi_qpu.h"
+++#include "rpi_shader.h"
+++#include "rpi_hevc_transform.h"
+++
+++#include "rpi_user_vcsm.h"
+++#ifdef GPUSERVICE
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
+++#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+++#pragma GCC diagnostic pop
+++#endif
+++
+++// QPU profile flags (bit values; FLAGS_FOR_PROFILING below is stored in a job's noflush field)
+++#define NO_FLUSH 1
+++#define CLEAR_PROFILE 2
+++#define OUTPUT_COUNTS 4
+++
+++#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++
+++
+++// On Pi2 there is no way to access the VPU L2 cache
+++// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+++// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+++// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+++#define GPU_MEM_FLG 0x4
+++// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
+++#define GPU_MEM_MAP 0x0
+++
+++#define vcos_verify_ge0(x) ((x)>=0)   // true when x is non-negative
+++
+++/*static const unsigned code[] =
+++{
+++  #include "rpi_shader.hex"
+++};*/
+++
+++// Size in 32bit words (capacity of the qpu_code / vpu_code arrays in struct GPU)
+++#define QPU_CODE_SIZE 2048
+++#define VPU_CODE_SIZE 2048
+++
+++const short rpi_transMatrix2even[32][16] = { // Even rows first; 32 rows of 16 coefficients, copied verbatim into struct GPU by gpu_init()
+++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
+++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
+++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
+++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
+++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
+++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
+++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
+++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
+++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
+++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
+++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
+++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
+++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
+++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
+++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
+++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
+++// Odd rows (second group of 16 rows)
+++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
+++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
+++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
+++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
+++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
+++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
+++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
+++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
+++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
+++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
+++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
+++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
+++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
+++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
+++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+++};
+++
+++struct GPU   // layout of the single allocation made by gpu_init(); member offsets are added to the GPU address via offsetof()
+++{
+++  unsigned int qpu_code[QPU_CODE_SIZE];   // filled from rpi_shader by gpu_init()
+++  unsigned int vpu_code[VPU_CODE_SIZE];   // filled from rpi_hevc_transform by gpu_init()
+++  short transMatrix2even[16*16*2];   // copy of rpi_transMatrix2even (32*16 shorts)
+++  int open_count; // Number of allocated video buffers
+++  int      mb; // Mailbox handle
+++  int      vc; // Address in GPU memory
+++  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+++  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+++};
+++
+++// Stop more than one thread trying to allocate memory or use the processing resources at once
+++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+++static volatile struct GPU* gpu = NULL;   // NULL until gpu_init() succeeds; NOTE(review): some readers test it without holding gpu_mutex
+++static GPU_MEM_PTR_T gpu_mem_ptr;   // allocation record backing *gpu, released by gpu_term()
+++
+++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+++static unsigned int Microseconds(void) {   // microseconds since the first call, modulo 2^32
+++    struct timespec ts;
+++    unsigned int x;
+++    static unsigned int base = 0;   // timestamp of the first call (0 = not yet taken)
+++    clock_gettime(CLOCK_MONOTONIC, &ts);   // monotonic: intervals must not jump when the wall clock is adjusted
+++    x = (unsigned int)ts.tv_sec*1000000u + (unsigned int)(ts.tv_nsec/1000);   // unsigned math wraps instead of overflowing a signed time_t
+++    if (base==0) base=x;
+++    return x-base;
+++}
+++#endif
+++
+++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+++static void gpu_free_internal(GPU_MEM_PTR_T *p);
+++
+++// Connect to QPU, returns 0 on success (-1: mailbox open failed, -2: qpu_enable failed, -4: RPI_ASYNC thread creation failed).
+++static int gpu_init(volatile struct GPU **gpu) {
+++  int mb = mbox_open();
+++  int vc;
+++  volatile struct GPU* ptr;
+++  if (mb < 0)
+++    return -1;
+++#ifndef RPI_ASYNC
+++  if (qpu_enable(mb, 1)) { mbox_close(mb); return -2; }  // do not leak the mailbox descriptor on failure
+++#endif
+++  vcsm_init();
+++  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);  // asserts internally on allocation failure
+++  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+++  memset((void*)ptr, 0, sizeof *ptr);
+++  vc = gpu_mem_ptr.vc;
+++
+++  ptr->mb = mb;
+++  ptr->vc = vc;
+++
+++  printf("GPU allocated at 0x%x\n",vc);
+++
+++  *gpu = ptr;  // publish first: qpu_get_fn() below would call gpu_lock() if the global were still NULL, and our caller already holds gpu_mutex
+++
+++  // Now copy over the QPU code into GPU memory
+++  {
+++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);  // NOTE(review): copy starts at rpi_shader, so this presumes mc_setup_uv is its first entry
+++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+++  }
+++  // And the VPU code
+++  {
+++    int num_bytes = sizeof(rpi_hevc_transform);
+++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+++  }
+++  // And the transform coefficients
+++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++
+++#ifdef RPI_ASYNC
+++  {
+++    int err;
+++    vpu_async_tail = 0;
+++    vpu_async_head = 0;
+++    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+++    //printf("Created thread\n");
+++    if (err) {
+++        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+++        return -4;
+++    }
+++
+++    {
+++      struct sched_param param = {0};
+++      int policy = 0;
+++
+++      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+++      {
+++        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+++      }
+++      else
+++      {
+++        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+++            policy,
+++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++            param.sched_priority);
+++
+++        policy = SCHED_FIFO;
+++        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+++
+++        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+++            policy,
+++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++            param.sched_priority);
+++
+++        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+++        {
+++          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+++        }
+++        else
+++        {
+++          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+++          {
+++            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+++          }
+++          else
+++          {
+++            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+++                policy,
+++                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++                param.sched_priority);
+++          }
+++        }
+++      }
+++
+++    }
+++
+++  }
+++#endif
+++
+++  return 0;
+++}
+++
+++// Non-blocking probe: 1 when gpu_mutex could be taken (it is released again at once), otherwise 0
+++static int gpu_idle(void)
+++{
+++  if (pthread_mutex_trylock(&gpu_mutex) != 0) {
+++    // trylock did not get the mutex, so report the GPU as busy
+++    return 0;
+++  }
+++  pthread_mutex_unlock(&gpu_mutex);
+++  return 1;
+++}
+++
+++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary (aborts if the GPU cannot be initialised).
+++static void gpu_lock(void) {
+++  pthread_mutex_lock(&gpu_mutex);
+++
+++  if (gpu==NULL) {
+++    av_assert0(gpu_init(&gpu) == 0);  // callers dereference gpu right after locking, so fail loudly here instead of on a NULL pointer
+++  }
+++}
+++
+++static void gpu_unlock(void) {   // counterpart of gpu_lock(): releases gpu_mutex
+++  pthread_mutex_unlock(&gpu_mutex);
+++}
+++
+++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {   // fills *p; always returns 0 because every failure trips an av_assert0
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle);   // ARM-side pointer
+++  av_assert0(p->arm);
+++  p->vc = mem_lock(mb, p->vc_handle);   // GPU-side address, obtained through the mailbox `mb` passed in by the caller
+++  av_assert0(p->vc);
+++  return 0;
+++}
+++
+++// Allocate memory on GPU (takes gpu_mutex; the first call also initialises the GPU via gpu_lock)
+++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+++// Returns 0 on success.
+++// This allocates memory that will not be cached in ARM's data cache.
+++// Therefore safe to use without data cache flushing.
+++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_lock();
+++  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+++  gpu->open_count++;   // balanced by the decrement in gpu_free()
+++  gpu_unlock();
+++  return r;
+++}
+++
+++int gpu_get_mailbox(void)   // returns the mailbox descriptor stored in the GPU state
+++{
+++  av_assert0(gpu);   // must only be called once the GPU has been initialised; gpu_mutex is not taken here
+++  return gpu->mb;
+++}
+++
+++// Call this to clean and invalidate a region of memory (the whole allocation described by p)
+++void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++    iocache.s[0].handle = p->vcsm_handle;
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int) p->arm;   // pointer narrowed to int
+++    iocache.s[0].size  = p->numbytes;
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    void *tmp = vcsm_lock(p->vcsm_handle);   // NOTE(review): presumably the lock/unlock pair is relied on for its cache-maintenance side effect - confirm against the vcsm API
+++    vcsm_unlock_ptr(tmp);
+++#endif
+++}
+++
+++void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)   // three-buffer variant of gpu_cache_flush
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};   // all three regions go out in a single vcsm_clean_invalid call
+++    iocache.s[0].handle = p0->vcsm_handle;
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int) p0->arm;
+++    iocache.s[0].size  = p0->numbytes;
+++    iocache.s[1].handle = p1->vcsm_handle;
+++    iocache.s[1].cmd = 3; // clean+invalidate
+++    iocache.s[1].addr = (int) p1->arm;
+++    iocache.s[1].size  = p1->numbytes;
+++    iocache.s[2].handle = p2->vcsm_handle;
+++    iocache.s[2].cmd = 3; // clean+invalidate
+++    iocache.s[2].addr = (int) p2->arm;
+++    iocache.s[2].size  = p2->numbytes;
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    void *tmp;   // fallback: one lock/unlock pair per buffer, in order p0, p1, p2
+++    tmp = vcsm_lock(p0->vcsm_handle);
+++    vcsm_unlock_ptr(tmp);
+++    tmp = vcsm_lock(p1->vcsm_handle);
+++    vcsm_unlock_ptr(tmp);
+++    tmp = vcsm_lock(p2->vcsm_handle);
+++    vcsm_unlock_ptr(tmp);
+++#endif
+++}
+++
+++static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {   // fills *p; always returns 0 because every failure trips an av_assert0
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle);   // ARM-side pointer
+++  av_assert0(p->arm);
+++  p->vc = mem_lock(gpu->mb, p->vc_handle);   // reads the global gpu, so the GPU must already be initialised
+++  av_assert0(p->vc);
+++  return 0;
+++}
+++
+++// This allocates data that will be (fills *p like gpu_malloc_uncached; returns 0 on success)
+++//    Cached in ARM L2
+++//    Uncached in VPU L2
+++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_lock();
+++  r = gpu_malloc_cached_internal(numbytes, p);
+++  gpu->open_count++;   // balanced by the decrement in gpu_free()
+++  gpu_unlock();
+++  return r;
+++}
+++
+++static void gpu_term(void)   // tears down what gpu_init() set up; no-op when the GPU was never initialised
+++{
+++  int mb;
+++
+++  if (gpu==NULL)
+++    return;
+++  mb = gpu->mb;   // copied out first: the struct lives in the allocation freed below
+++
+++  // ??? Tear down anything needed for gpuexecute
+++
+++  qpu_enable(mb, 0);
+++  gpu_free_internal(&gpu_mem_ptr);
+++
+++  vcsm_exit();
+++
+++  mbox_close(mb);
+++  gpu = NULL;
+++}
+++
+++void gpu_free_internal(GPU_MEM_PTR_T *p) {   // undoes gpu_malloc_*_internal for p; forward-declared static earlier in this file
+++  int mb = gpu->mb;   // read before anything is released (p may be the allocation that holds *gpu)
+++  mem_unlock(mb,p->vc_handle);
+++  vcsm_unlock_ptr(p->arm);
+++  vcsm_free(p->vcsm_handle);
+++}
+++
+++void gpu_free(GPU_MEM_PTR_T *p) {   // releases a buffer from gpu_malloc_cached/uncached; shuts the GPU down when the last one goes
+++  gpu_lock();
+++
+++  gpu_free_internal(p);
+++
+++  gpu->open_count--;
+++  if (gpu->open_count==0) {
+++      printf("Closing GPU\n");
+++      gpu_term();
+++      gpu = NULL;   // redundant: gpu_term() has already cleared it
+++  }
+++  gpu_unlock();
+++}
+++
+++unsigned int vpu_get_fn(void) {   // GPU address of the vpu_code array inside struct GPU
+++  // Make sure that the gpu is initialized (gpu is tested here without holding gpu_mutex)
+++  if (gpu==NULL) {
+++    printf("Preparing gpu\n");
+++    gpu_lock();   // lock/unlock pair is used only for gpu_lock's initialise-if-NULL side effect
+++    gpu_unlock();
+++  }
+++  return gpu->vc + offsetof(struct GPU,vpu_code);
+++}
+++
+++unsigned int vpu_get_constants(void) {   // GPU address of the transMatrix2even table inside struct GPU
+++  if (gpu==NULL) {
+++    gpu_lock();   // lock/unlock pair is used only for gpu_lock's initialise-if-NULL side effect
+++    gpu_unlock();
+++  }
+++  return gpu->vc + offsetof(struct GPU,transMatrix2even);
+++}
+++
+++#ifdef GPUSERVICE
+++static void callback(void *cookie)   // job-completion callback: cookie is the sem_t the submitter waits on
+++{
+++  sem_post((sem_t *)cookie);
+++}
+++#endif
+++
+++
+++static volatile uint32_t post_done = 0;   // highest job id reported complete by post_code2_cb
+++static volatile uint32_t post_qed = 0;    // last job id handed out by vpu_post_code2
+++
+++static void post_code2_cb(void * v)   // completion callback; v carries the job id, not a real pointer
+++{
+++  uint32_t n = (uint32_t)v;
+++  if ((int32_t)(n - post_done) > 0) {   // signed difference keeps the ordering test valid across 32-bit wraparound
+++    post_done = n;
+++  }
+++}
+++
+++
+++// Post a command to the queue (one EXECUTE_VPU job; the buf argument is not used)
+++// Returns an id which we can use to wait for completion
+++int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+++{
+++  struct gpu_job_s j[1] = {
+++    {
+++      .command = EXECUTE_VPU,
+++      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+++      .callback.func = post_code2_cb
+++    }
+++  };
+++  uint32_t id;
+++
+++  j[0].callback.cookie = (void *)(id = ++post_qed);   // NOTE(review): plain increment of a volatile, no lock - confirm callers are serialised
+++
+++  av_assert0(vc_gpuserv_execute_code(1, j) == 0);   // submission failure aborts
+++
+++  return id;   // NOTE(review): after 2^32 posts this can be 0, which vpu_wait() treats as nothing to wait for
+++}
+++
+++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,   // submits one VPU job and two QPU jobs, blocks until they are done; always returns 0
+++    int qpu0_n, const uint32_t * qpu0_mail,
+++    int qpu1_n, const uint32_t * qpu1_mail)
+++{
+++#if 1   // active variant: a fourth EXECUTE_SYNC job signals one semaphore
+++  sem_t sync0;   // NOTE(review): never passed to sem_destroy
+++  struct gpu_job_s j[4];   // not zero-initialised; only the fields assigned below are set
+++
+++  sem_init(&sync0, 0, 0);
+++
+++  j[0].command = EXECUTE_VPU;
+++  j[0].u.v.q[0] = vpu_code;
+++  j[0].u.v.q[1] = r0;
+++  j[0].u.v.q[2] = r1;
+++  j[0].u.v.q[3] = r2;
+++  j[0].u.v.q[4] = r3;
+++  j[0].u.v.q[5] = r4;
+++  j[0].u.v.q[6] = r5;
+++  j[0].callback.func = 0;
+++  j[0].callback.cookie = NULL;
+++
+++  j[1].command = EXECUTE_QPU;   // note the order: the qpu1 mail goes into job 1, the qpu0 mail into job 2
+++  j[1].u.q.jobs = qpu1_n;
+++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));   // NOTE(review): no check that qpu1_n fits the control array
+++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+++  j[1].u.q.timeout = 5000;
+++  j[1].callback.func = 0;
+++  j[1].callback.cookie = NULL;
+++
+++  j[2].command = EXECUTE_QPU;
+++  j[2].u.q.jobs = qpu0_n;
+++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));   // NOTE(review): no check that qpu0_n fits the control array
+++  j[2].u.q.noflush = 1;
+++  j[2].u.q.timeout = 5000;
+++  j[2].callback.func = 0;
+++  j[2].callback.cookie = NULL;
+++
+++  j[3].command = EXECUTE_SYNC;
+++  j[3].u.s.mask = 3;
+++  j[3].callback.func = callback;   // the only job with a callback: posts sync0
+++  j[3].callback.cookie = (void *)&sync0;
+++
+++  av_assert0(vc_gpuserv_execute_code(4, j) == 0);   // submission failure aborts
+++
+++  sem_wait(&sync0);   // block until the sync job's callback fires
+++#else   // disabled variant: per-job callbacks on the VPU job and the last QPU job, no sync job
+++
+++  sem_t sync0, sync2;
+++  struct gpu_job_s j[3];
+++
+++  sem_init(&sync0, 0, 0);
+++  sem_init(&sync2, 0, 0);
+++
+++  j[0].command = EXECUTE_VPU;
+++  j[0].u.v.q[0] = vpu_code;
+++  j[0].u.v.q[1] = r0;
+++  j[0].u.v.q[2] = r1;
+++  j[0].u.v.q[3] = r2;
+++  j[0].u.v.q[4] = r3;
+++  j[0].u.v.q[5] = r4;
+++  j[0].u.v.q[6] = r5;
+++  j[0].callback.func = callback;
+++  j[0].callback.cookie = (void *)&sync0;
+++
+++  j[1].command = EXECUTE_QPU;
+++  j[1].u.q.jobs = qpu1_n;
+++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+++  j[1].u.q.timeout = 5000;
+++  j[1].callback.func = 0;
+++  j[1].callback.cookie = NULL;
+++
+++  j[2].command = EXECUTE_QPU;
+++  j[2].u.q.jobs = qpu0_n;
+++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[2].u.q.noflush = 1;
+++  j[2].u.q.timeout = 5000;
+++  j[2].callback.func = callback;
+++  j[2].callback.cookie = (void *)&sync2;
+++
+++  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+++
+++  sem_wait(&sync0);
+++  sem_wait(&sync2);
+++#endif
+++
+++  return 0;
+++}
+++
+++
+++// Wait for completion of the given command (id as returned by vpu_post_code2; id 0 returns immediately)
+++void vpu_wait(int id)
+++{
+++  if (id == 0) {   // the only code for this case is compiled out, so nothing happens
+++#if 0
+++    sem_t sync0;
+++    struct gpu_job_s j[1] =
+++    {
+++      {
+++        .command = EXECUTE_SYNC,
+++        .u.s.mask = 3,
+++        .callback.func = callback,
+++        .callback.cookie = (void *)&sync0
+++      }
+++    };
+++
+++    sem_init(&sync0, 0, 0);
+++
+++    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++
+++    sem_wait(&sync0);
+++#endif
+++  }
+++  else {
+++    while ((int32_t)(post_done - (uint32_t)id) < 0) {   // wraparound-safe test for post_done not having reached id yet
+++      usleep(1000);   // poll once per millisecond; there is no timeout
+++    }
+++  }
+++}
+++
+++
+++unsigned int qpu_get_fn(int num) {   // maps a QPU_MC_* id to the GPU address of that shader entry; exits on an unknown id
+++    // Make sure that the gpu is initialized
+++    unsigned int *fn;
+++    if (gpu==NULL) {
+++      printf("Preparing gpu\n");
+++      gpu_lock();   // lock/unlock pair is used only for gpu_lock's initialise-if-NULL side effect
+++      gpu_unlock();
+++    }
+++    switch(num) {
+++    case QPU_MC_SETUP:
+++      fn = mc_setup;
+++      break;
+++    case QPU_MC_FILTER:
+++      fn = mc_filter;
+++      break;
+++    case QPU_MC_EXIT:
+++      fn = mc_exit;
+++      break;
+++    case QPU_MC_INTERRUPT_EXIT12:
+++      fn = mc_interrupt_exit12;
+++      break;
+++    case QPU_MC_FILTER_B:
+++      fn = mc_filter_b;
+++      break;
+++    //case QPU_MC_FILTER_HONLY:
+++    //  fn = mc_filter_honly;
+++    //  break;
+++    case QPU_MC_SETUP_UV:
+++      fn = mc_setup_uv;
+++      break;
+++    case QPU_MC_FILTER_UV:
+++      fn = mc_filter_uv;
+++      break;
+++    case QPU_MC_FILTER_UV_B0:
+++      fn = mc_filter_uv_b0;
+++      break;
+++    case QPU_MC_FILTER_UV_B:
+++      fn = mc_filter_uv_b;
+++      break;
+++    case QPU_MC_INTERRUPT_EXIT8:
+++      fn = mc_interrupt_exit8;
+++      break;
+++    case QPU_MC_END:
+++      fn = mc_end;
+++      break;
+++    default:
+++      printf("Unknown function\n");
+++      exit(-1);
+++    }
+++    return gpu->vc + 4*(int)(fn-rpi_shader);   // element distance from rpi_shader times 4, added to the base GPU address (qpu_code is the first member of struct GPU)
+++    //return code[num] + gpu->vc;
+++}
+++
+++#if 0   // disabled code: nothing down to the matching #endif is compiled
+++typedef unsigned int uint32_t;
+++
+++typedef struct mvs_s {   // minimal stand-in context used only by the disabled test code that follows
+++    GPU_MEM_PTR_T unif_mvs_ptr;
+++    uint32_t *unif_mvs; // Base of memory for motion vector commands
+++
+++    // _base pointers are to the start of the row
+++    uint32_t *mvs_base[8];
+++    // these pointers are to the next free space
+++    uint32_t *u_mvs[8];
+++
+++} HEVCContext;
+++
+++#define RPI_CHROMA_COMMAND_WORDS 12   // words per command
+++
+++
+++static void rpi_inter_clear(HEVCContext *s)   // resets all 8 streams and writes one initial 12-word command into each
+++{
+++    int i;
+++    for(i=0;i<8;i++) {
+++        s->u_mvs[i] = s->mvs_base[i];   // rewind to the start of the row
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 128;  // w
+++        *s->u_mvs[i]++ = 128;  // h
+++        *s->u_mvs[i]++ = 128;  // stride u
+++        *s->u_mvs[i]++ = 128;  // stride v
+++        s->u_mvs[i] += 3;  // Padding words (9 written + 3 skipped = 12 words)
+++    }
+++}
+++
+++static void rpi_execute_inter_qpu(HEVCContext *s)   // patches the last command of each stream, then launches all 8 streams
+++{
+++    int k;
+++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;   // GPU-side base of the uniform buffer
+++
+++    for(k=0;k<8;k++) {
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+++    }
+++
+++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+++
+++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),   // each argument below rebases an ARM-side row pointer onto the GPU-side buffer address
+++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+++      );
+++}
+++
+++void rpi_test_qpu(void)   // smoke test: allocate a uniform buffer, launch the QPUs four times, free the buffer
+++{
+++    HEVCContext mvs;
+++    HEVCContext *s = &mvs;
+++    int i;
+++    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;   // words reserved per stream
+++    uint32_t *p;
+++    printf("Allocate memory\n");
+++    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );   // return value is not checked
+++    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+++
+++    // Set up initial locations for uniform streams
+++    p = s->unif_mvs;
+++    for(i = 0; i < 8; i++) {
+++        s->mvs_base[i] = p;
+++        p += uv_commands_per_qpu;
+++    }
+++    // Now run a simple program that should just quit immediately after a single texture fetch
+++    rpi_inter_clear(s);
+++    for(i=0;i<4;i++) {
+++      printf("Launch QPUs\n");
+++      rpi_execute_inter_qpu(s);
+++      printf("Done\n");
+++    }
+++    printf("Free memory\n");
+++    gpu_free(&s->unif_mvs_ptr);
+++    return;
+++}
+++#endif
+++
+++#if 0
+++
+++int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};  // horizontal taps read by filter8()
+++//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+++int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};  // vertical taps read by filter8()
+++//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+++// Packs four taps, each stored as (tap-1) & 0xff, into one word with c0 in the low byte (unsigned so << 24 cannot overflow)
+++#define ENCODE_COEFFS(c0, c1, c2, c3) ((((uint32_t)((c0)-1)) & 0xffu) | ((((uint32_t)((c1)-1)) & 0xffu) << 8) | ((((uint32_t)((c2)-1)) & 0xffu) << 16) | ((((uint32_t)((c3)-1)) & 0xffu) << 24))
+++
+++static uint8_t av_clip_uint8(int32_t a)
+++{
+++    if (a&(~255)) return a < 0 ? 0 : 255;  // Out of range: saturate without negating or shifting a signed value
+++    else          return a;                // Already within 0..255
+++}
+++
+++static int32_t filter8(const uint8_t *data, int pitch)
+++{
+++   // Reference 8x8 filter: each row is weighted by hcoeffs, the eight row results by vcoeffs
+++   int32_t acc_v = 0;
+++   int row;
+++   for (row = 0; row != 8; ++row) {
+++      const uint8_t * const line = data + row * pitch;
+++      int32_t acc_h = 0;
+++      int col;
+++      for (col = 0; col != 8; ++col)
+++         acc_h += hcoeffs[col] * line[col];
+++      // Round (+64, >>7) and clip the row result to 8 bits before applying the vertical tap
+++      acc_v += vcoeffs[row] * av_clip_uint8((acc_h + 64) >> 7);
+++   }
+++   return av_clip_uint8((acc_v + 64) >> 7);
+++}
+++
+++// Note: REGRESSION mode overwrites the global hcoeffs/vcoeffs tables, so it is not thread safe
+++//#define REGRESSION
+++#ifdef REGRESSION
+++#define CMAX 100
+++#else
+++#define CMAX 2
+++#endif
+++#define YMAX 16
+++
+++int rpi_test_shader(void)
+++{
+++   // Self-test: for CMAX rounds, runs rpi_do_block() on random input for two 16-wide blocks and
+++   // compares each output pixel with filter8(). Returns 0, or -2/-3/-4 if a GPU allocation fails.
+++   int i, c;
+++   uint32_t *unifs;
+++   uint8_t *in_buffer;
+++   uint8_t *out_buffer[2];
+++
+++   GPU_MEM_PTR_T unifs_ptr;
+++   GPU_MEM_PTR_T in_buffer_ptr;
+++   GPU_MEM_PTR_T out_buffer_ptr[2];
+++
+++   // Addresses in GPU memory of filter programs
+++   uint32_t mc_setup = 0;
+++   uint32_t mc_filter = 0;
+++   uint32_t mc_exit = 0;
+++
+++   int pitch = 0x500;
+++
+++   if (gpu==NULL) {
+++      gpu_lock();
+++      gpu_unlock();
+++   }
+++
+++   printf("This needs to change to reflect new assembler\n");
+++   // Use table to compute locations of program start points
+++   mc_setup = code[0] + gpu->vc;
+++   mc_filter = code[1] + gpu->vc;
+++   mc_exit = code[2] + gpu->vc;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+++      return -2;
+++   }
+++   unifs = (uint32_t*)unifs_ptr.arm;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+++      gpu_free(&unifs_ptr);  // Failure paths release whatever was already allocated
+++      return -3;
+++   }
+++   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0]))) {
+++      gpu_free(&in_buffer_ptr);
+++      gpu_free(&unifs_ptr);
+++      return -4;
+++   }
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+++      gpu_free(&out_buffer_ptr[0]);
+++      gpu_free(&in_buffer_ptr);
+++      gpu_free(&unifs_ptr);
+++      return -4;
+++   }
+++   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+++   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+++
+++   for (c = 0; c < CMAX; c++) {
+++      int xo[] = {rand()&31, rand()&31};
+++
+++#ifdef REGRESSION
+++      for (i = 0; i < 8; i++) {
+++         hcoeffs[i] = (int8_t)rand();
+++         vcoeffs[i] = (int8_t)rand();
+++         if (hcoeffs[i]==-128)
+++           hcoeffs[i]++;
+++         if (vcoeffs[i]==-128)
+++           vcoeffs[i]++;
+++      }
+++#endif
+++
+++      for (i = 0; i < 64*23; i++) {
+++         in_buffer[i] = rand();
+++      }
+++
+++      // Preset the 16x16 output area of both buffers; rows are 'pitch' bytes apart, as in the check below
+++      {
+++        int b, y;
+++        for(b=0;b<2;b++)
+++          for(y=0;y<16;y++)
+++            for(i=0;i<16;i++)
+++              out_buffer[b][i + y*pitch] = 3;
+++      }
+++
+++      unifs[0] = mc_filter;
+++      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+++      unifs[2] = 64; // src pitch
+++      unifs[3] = pitch; // dst pitch
+++      unifs[4] = 0; // Padding
+++      unifs[5] = 0;
+++      unifs[6] = 0;
+++      unifs[7 ] = mc_filter;
+++      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+++      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++      unifs[13] = out_buffer_ptr[0].vc;
+++      unifs[14] = mc_exit;
+++      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+++      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++      unifs[20] = out_buffer_ptr[1].vc;
+++
+++      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+++
+++      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+++
+++      // unifs[] above is only consumed by a direct launch such as qpu_run_shader(mc_setup, unifs_ptr.vc);
+++      rpi_do_block((const uint8_t *)(in_buffer_ptr.vc+xo[0]+16), 64, (uint8_t *)out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+++      rpi_do_block((const uint8_t *)(in_buffer_ptr.vc+xo[1]+16), 64, (uint8_t *)out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+++
+++      {
+++         int x, y, b;
+++         int bad = 0;
+++
+++         for (b=0; b<2; ++b)
+++            for (y=0; y<YMAX; ++y)
+++               for (x=0; x<16; ++x) {
+++                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+++
+++                  if (out_buffer[b][x+y*pitch] != ref) {
+++                      bad = 1;
+++                  }
+++               }
+++          if (bad)
+++            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+++          else
+++            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+++      }
+++   }
+++
+++   gpu_free(&out_buffer_ptr[0]);
+++   gpu_free(&out_buffer_ptr[1]);
+++   gpu_free(&in_buffer_ptr);
+++   gpu_free(&unifs_ptr);
+++
+++   return 0;
+++}
+++
+++void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++{
+++  // ARM-side reference: every pixel of the 16x16 destination block is produced by filter8()
+++  int row, col;
+++  for (row = 0; row != 16; ++row) {
+++    for (col = 0; col != 16; ++col)
+++      dst[col + row * dst_pitch] = filter8(in_buffer + col + row * src_pitch, src_pitch);
+++  }
+++}
+++
+++void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+++{
+++   // Writes a 14-word uniform stream (two 7-word groups, headed by the filter and exit program
+++   // addresses) into uncached GPU memory and passes it to qpu_run_shader() with the setup address.
+++   //   in_buffer_vc, dst_vc : source / destination addresses, stored in the stream as integers
+++   //   src_pitch, dst_pitch : stored in the stream as the source / destination pitch
+++   //   dst                  : not referenced by the live code (only by the commented-out copy-back)
+++   // Returns without launching anything if the uniform allocation fails.
+++   GPU_MEM_PTR_T stream_mem;
+++   uint32_t *w;
+++   //uint8_t *out_buffer;
+++   //GPU_MEM_PTR_T out_buffer_ptr;
+++   //int x,y;
+++
+++   // Addresses in GPU memory of filter programs
+++   uint32_t setup_pc = 0;
+++   uint32_t filter_pc = 0;
+++   uint32_t exit_pc = 0;
+++
+++   if (gpu==NULL) {
+++      gpu_lock();
+++      gpu_unlock();
+++   }
+++
+++   // Use table to compute locations of program start points
+++   setup_pc = gpu->vc + code[0];
+++   filter_pc = gpu->vc + code[1];
+++   exit_pc = gpu->vc + code[2];
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&stream_mem)))
+++      return;
+++   w = (uint32_t*)stream_mem.arm;
+++   // Alternative kept for reference: filter into a scratch GPU buffer, then copy back to dst
+++   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+++   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+++   /*for (y=0; y<16; ++y) {
+++      for (x=0; x<16; ++x) {
+++         out_buffer[x+y*dst_pitch] = 7;
+++      }
+++    }*/
+++
+++   // Words 0-6: group headed by the filter program address
+++   *w++ = filter_pc;
+++   *w++ = (int)in_buffer_vc;
+++   *w++ = src_pitch; // src pitch
+++   *w++ = dst_pitch; // dst pitch
+++   *w++ = 0; // Padding
+++   *w++ = 0;
+++   *w++ = 0;
+++   // Words 7-13: group headed by the exit program address
+++   *w++ = exit_pc;
+++   *w++ = (int)in_buffer_vc;
+++   *w++ = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++   *w++ = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++   *w++ = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++   *w++ = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++   *w++ = (int)dst_vc; // or (int)out_buffer_ptr.vc for the scratch-buffer variant
+++
+++   qpu_run_shader(setup_pc, stream_mem.vc);
+++   /*for (y=0; y<16; ++y) {
+++      for (x=0; x<16; ++x) {
+++         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+++      }
+++    }*/
+++   gpu_free(&stream_mem);
+++   //gpu_free(&out_buffer_ptr);
+++}
+++
+++
+++
+++#endif
+++
+++#endif // RPI
++diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
++new file mode 100644
++index 0000000..c6cdb2b
++--- /dev/null
+++++ b/libavcodec/rpi_qpu.h
++@@ -0,0 +1,176 @@
+++#ifndef RPI_QPU_H
+++#define RPI_QPU_H
+++
+++// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+++// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+++#define RPI_FAST_CACHEFLUSH
+++
+++#define RPI_ONE_BUF 1
+++
+++typedef struct gpu_mem_ptr_s {  // Descriptor filled in by gpu_malloc_cached()/gpu_malloc_uncached() and released with gpu_free()
+++  unsigned char *arm; // Pointer to memory mapped on ARM side
+++  int vc_handle;   // Videocore handle of relocatable memory
+++  int vcsm_handle; // Handle for use by VCSM
+++  int vc;       // Address for use in GPU code
+++  int numbytes; // Size of memory block
+++} GPU_MEM_PTR_T;
+++
+++// General GPU functions
+++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+++extern void gpu_free(GPU_MEM_PTR_T *p);
+++extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+++extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+++
+++#include "libavutil/frame.h"
+++#if !RPI_ONE_BUF
+++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+++    // GPU address of the Y plane, read from the descriptor attached to buf[0]
+++    return ((GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]))->vc;
+++}
+++
+++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+++    // GPU address of the U plane, read from the descriptor attached to buf[1]
+++    return ((GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]))->vc;
+++}
+++
+++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+++    // GPU address of the V plane, read from the descriptor attached to buf[2]
+++    return ((GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]))->vc;
+++}
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+++    return *(const GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);  // by-value copy of the buf[0] descriptor
+++}
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+++    return *(const GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);  // by-value copy of the buf[1] descriptor
+++}
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+++    return *(const GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);  // by-value copy of the buf[2] descriptor
+++}
+++
+++#else
+++
+++static inline int gpu_is_buf1(const AVFrame * const frame)
+++{
+++    return !frame->buf[1];  // 1 when the frame has no second buffer, 0 otherwise
+++}
+++
+++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+++{
+++    return (GPU_MEM_PTR_T *)av_buffer_get_opaque(frame->buf[0]);  // descriptor stored as the opaque of buf[0]
+++}
+++
+++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+++{
+++    return (GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[n]);  // descriptor from the pool opaque of buf[n]
+++}
+++
+++
+++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+++    return (gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, 0))->vc;  // Y starts at the descriptor's own GPU address
+++}
+++
+++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+++    return gpu_is_buf1(frame) ?  // buf[1] == NULL: U is addressed through the buf[0] descriptor
+++        gpu_buf1_gmem(frame)->vc + (frame->data[1] - frame->data[0]) :  // plane offset first, so no int+pointer value is formed
+++        gpu_buf3_gmem(frame, 1)->vc;
+++}
+++
+++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+++    return gpu_is_buf1(frame) ?  // buf[1] == NULL: V is addressed through the buf[0] descriptor
+++        gpu_buf1_gmem(frame)->vc + (frame->data[2] - frame->data[0]) :  // plane offset first, so no int+pointer value is formed
+++        gpu_buf3_gmem(frame, 2)->vc;
+++}
+++
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T luma;
+++    // Separate buffers per plane: hand back the buf[0] descriptor unchanged
+++    if (!gpu_is_buf1(frame))
+++        return *gpu_buf3_gmem(frame, 0);
+++    // buf[1] is NULL: copy the buf[0] descriptor and size it as the span from data[0] to data[1]
+++    luma = *gpu_buf1_gmem(frame);
+++    luma.numbytes = frame->data[1] - frame->data[0];
+++    return luma;
+++}
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T cb;
+++    if (!gpu_is_buf1(frame))
+++        return *gpu_buf3_gmem(frame, 1);
+++    // buf[1] is NULL: start from the buf[0] descriptor and move both the ARM
+++    // pointer and the GPU address forward to where data[1] begins
+++    cb = *gpu_buf1_gmem(frame);
+++    cb.arm += frame->data[1] - frame->data[0];
+++    cb.vc += frame->data[1] - frame->data[0];
+++    cb.numbytes = frame->data[2] - frame->data[1];  // chroma size
+++    return cb;
+++}
+++
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T cr;
+++    if (!gpu_is_buf1(frame))
+++        return *gpu_buf3_gmem(frame, 2);
+++    // buf[1] is NULL: start from the buf[0] descriptor and move both the ARM
+++    // pointer and the GPU address forward to where data[2] begins
+++    cr = *gpu_buf1_gmem(frame);
+++    cr.arm += frame->data[2] - frame->data[0];
+++    cr.vc += frame->data[2] - frame->data[0];
+++    cr.numbytes = frame->data[2] - frame->data[1];  // chroma size
+++    return cr;
+++}
+++
+++#endif
+++
+++
+++// QPU specific functions
+++extern void rpi_test_qpu(void);
+++
+++enum {  // Selectors passed to qpu_get_fn() to obtain the value written into QPU uniform streams
+++  QPU_MC_SETUP,
+++  QPU_MC_FILTER,
+++  QPU_MC_EXIT,
+++  QPU_MC_INTERRUPT_EXIT12,
+++  QPU_MC_FILTER_B,
+++  QPU_MC_FILTER_HONLY,
+++  QPU_MC_SETUP_UV,
+++  QPU_MC_FILTER_UV,
+++  QPU_MC_FILTER_UV_B0,
+++  QPU_MC_FILTER_UV_B,
+++  QPU_MC_INTERRUPT_EXIT8,
+++  QPU_MC_END  // NOTE(review): presumably a count/sentinel rather than a real selector - confirm
+++  };
+++extern unsigned int qpu_get_fn(int num);
+++
+++#define QPU_N_UV   8   // NOTE(review): presumably the number of QPUs used for chroma (UV) - confirm
+++#define QPU_N_Y    12  // NOTE(review): presumably the number of QPUs used for luma (Y) - confirm
+++#define QPU_N_MAX  16  // Upper bound used to size the mail arrays below
+++
+++#define QPU_MAIL_EL_VALS  2  // uint32_t values per mail element
+++#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))  // bytes per mail element
+++#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)  // values needed for QPU_N_MAX elements
+++#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))  // the same, in bytes
+++
+++// VPU specific functions
+++extern unsigned int vpu_get_fn(void);
+++extern unsigned int vpu_get_constants(void);
+++//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+++extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++    int qpu0_n, const uint32_t * qpu0_mail,
+++    int qpu1_n, const uint32_t * qpu1_mail);
+++
+++extern void vpu_wait( int id);
+++
+++// Simple test of shader code
+++extern int rpi_test_shader(void);
+++
+++extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+++extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+++
+++extern int gpu_get_mailbox(void);
+++
+++#endif
++diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
++new file mode 100644
++index 0000000..06fb166
++--- /dev/null
+++++ b/libavcodec/rpi_shader.c
++@@ -0,0 +1,629 @@
+++#include "rpi_shader.h"
+++
+++#ifdef _MSC_VER
+++   #include <stdint.h>
+++   /* cast through uintptr_t to avoid warnings */
+++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+++#else
+++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
+++#endif
+++
+++#ifdef __cplusplus
+++extern "C" { /* the types are probably wrong... */
+++#endif
+++#ifdef __cplusplus
+++}
+++#endif
+++
+++#ifdef _MSC_VER
+++__declspec(align(8))
+++#elif defined(__GNUC__)
+++__attribute__((aligned(8)))
+++#endif
+++unsigned int rpi_shader[] = {
+++// ::mc_setup_uv
+++/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+++/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+++/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+++/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+++/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+++/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+++/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+++/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+++/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+++/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+++/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+++/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+++/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+++/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+++/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+++/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+++/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+++/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+++/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+++/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+++/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+++/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+++/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+++/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+++/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+++/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+++/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+++/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+++/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+++/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+++/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+++/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+++/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+++/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+++/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+++/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+++/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+++/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+++/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+++/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+++/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+++/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+++/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+++/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+++/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+++/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+++/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+++// ::mc_filter_uv
+++/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
+++/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+++/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
+++/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+++/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
+++/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
+++/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
+++/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
+++/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
+++/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+++/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+++// :uvloop
+++/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+++/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+++/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+++/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_filter_uv_b0
+++/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
+++/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
+++/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
+++/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
+++/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
+++/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
+++/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+++/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+++/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+++/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+++/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
+++/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
+++// :uvloop_b0
+++/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+++/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+++/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+++/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_filter_uv_b
+++/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+++/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
+++/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
+++/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
+++/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
+++/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
+++/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+++/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+++/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+++/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+++/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
+++/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
+++/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
+++/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
+++/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+++/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++// :uvloop_b
+++/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
+++/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+++/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
+++/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+++/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
+++/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+++/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+++/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_exit
+++/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+++/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_interrupt_exit8
+++/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_setup
+++/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+++/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+++/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+++/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+++/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+++/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+++/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+++/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+++/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+++/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+++/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+++/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+++/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+++/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+++/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+++/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+++/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+++/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+++/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+++/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+++/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+++/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+++/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+++/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+++/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+++/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+++/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+++/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+++/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+++/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+++/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+++/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+++/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+++/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+++/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+++/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+++/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+++/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+++/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+++/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+++/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+++/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+++/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+++/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+++/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+++/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+++/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+++// :per_block_setup
+++/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
+++/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+++/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+++/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+++/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+++/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
+++/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+++/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+++/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+++/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+++/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+++/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
+++/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+++/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+++/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+++/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+++/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+++/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+++/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+++/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+++/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+++/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+++/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+++/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+++/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+++/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+++/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+++/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+++/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+++/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+++/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+++/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+++/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+++/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+++/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+++/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+++/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+++/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+++/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+++/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+++/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+++/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
+++/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
+++/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
+++/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
+++/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
+++/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
+++/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+++// ::mc_filter
+++/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+++// :yloop
+++/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+++/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+++/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_filter_b
+++/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+++// :yloopb
+++/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+++/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+++/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+++/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+++/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+++/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_interrupt_exit12
+++/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit1
+++/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_end
+++};
+++#ifdef __HIGHC__
+++#pragma Align_to(8, rpi_shader)
+++#endif
++diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
++new file mode 100644
++index 0000000..9772796
++--- /dev/null
+++++ b/libavcodec/rpi_shader.h
++@@ -0,0 +1,19 @@
+++#ifndef rpi_shader_H
+++#define rpi_shader_H
+++/* Shader instruction words; each 8-byte instruction occupies two elements. */
+++extern unsigned int rpi_shader[];
+++/* Entry points, as element offsets into rpi_shader[] (byte offset / 4). */
+++#define mc_setup_uv (rpi_shader + 0)
+++#define mc_filter_uv (rpi_shader + 132)
+++#define mc_filter_uv_b0 (rpi_shader + 274)
+++#define mc_filter_uv_b (rpi_shader + 392)
+++#define mc_exit (rpi_shader + 540)
+++#define mc_interrupt_exit8 (rpi_shader + 558)
+++#define mc_setup (rpi_shader + 588)
+++#define mc_filter (rpi_shader + 872)
+++#define mc_filter_b (rpi_shader + 992)
+++#define mc_interrupt_exit12 (rpi_shader + 1114)
+++#define mc_exit1 (rpi_shader + 1152)
+++#define mc_end (rpi_shader + 1168)
+++/* mc_end points one element past the last instruction word in the table. */
+++#endif /* rpi_shader_H */
++diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
++new file mode 100644
++index 0000000..aa9e1e7
++--- /dev/null
+++++ b/libavcodec/rpi_shader.qasm
++@@ -0,0 +1,1098 @@
+++# register allocation
+++#
+++# ra0...ra7                                     eight horizontal filter coefficients
+++#
+++# rb0 rx_xshift2
+++# rb1 rx_xshift2_next
+++#
+++# rb4...rb7
+++#
+++# rb8..rb11, ra8...ra11                         Y: eight filtered rows of context (ra11 == most recent)
+++#
+++#                                               (ra15 isn't clamped to zero - this happens during the
+++#                                                copy to ra14, and during its use in the vertical filter)
+++#
+++# rb8...rb11                                    eight vertical filter coefficients
+++
+++# ra4                                           y: Filter, UV: 0x10000
+++
+++# rb12                                          offset to add before shift (round + weighting offsets)
+++# rb13                                          shift: denom + 6 + 9
+++# rb14                                          L0 weight (U on left, V on right)
+++# rb15                                          -- free --
+++#
+++# ra16                                          clipped(row start address+elem_num)&~3
+++# ra17                                          per-channel shifts
+++# ra18                                          L1 weight (Y)
+++# ra19                                          next ra17
+++#
+++# rb16                                          pitch
+++# rb17                                          height + 1
+++# rb18                                          height + 3
+++# rb19                                          next ra16
+++#
+++# ra20                                          1
+++# ra21                                          ra_y2 (.16a), ra_y2_next (.16b)
+++# ra22 ra_k256                                  256
+++# ra23                                          (was listed as ra_y2_next, but the .set table maps ra_y2_next to ra21.16b - TODO confirm use)
+++#
+++# rb20                                          0xffffff00
+++# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+++# rb22 rb_k255                                  255
+++# rb23                                          24
+++#
+++# rb24                                          vdw_setup_1(dst_pitch)
+++# rb25                                          frame width-1
+++# rb26                                          height<<23 + width<<16 + vdw_setup_0
+++# rb27                                          vdw_setup_0 (depends on QPU number)
+++# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
+++# rb29                                          vdw_setup_1(dst_pitch-width)
+++# rb30                                          frame height-1
+++# rb31                                          used as temp to count loop iterations
+++#
+++# ra24                                          clipped(row start address+8+elem_num)&~3
+++# ra25                                          per-channel shifts 2
+++# ra26                                          next ra24
+++# ra27                                          next ra25
+++# ra28                                          next y
+++# ra29                                          y for next texture access
+++# ra30                                          64
+++#
+++# ra31                                          next kernel address
+++
+++.set rb_frame_width_minus_1,       rb25
+++.set rb_frame_height_minus_1,      rb30
+++.set rb_pitch,                     rb16
+++.set ra_x,                         ra16
+++.set ra_y2,                        ra21.16a
+++.set ra_y2_next,                   ra21.16b
+++
+++.set rb_x_next,                    rb19
+++.set rx_frame_base2_next,          rb19
+++
+++.set ra_frame_base,                ra24
+++.set ra_frame_base_next,           ra26
+++.set ra_xshift,                    ra17
+++
+++.set ra_u2v_ref_offset,            ra25
+++.set ra_frame_base2,               ra25
+++
+++.set ra_xshift_next,               ra19
+++.set rx_xshift2,                   rb0
+++.set rx_xshift2_next,              rb1
+++
+++.set ra_u2v_dst_offset,            ra27
+++
+++.set ra_y_next,                    ra28
+++.set ra_y,                         ra29
+++
+++.set ra_k1,                        ra20
+++.set rb_k255,                      rb22
+++.set ra_k256,                      ra22
+++
+++# Shift amounts only use their bottom 5 bits, so -16 acts as 16, -15 as 17 ... and -11 as 21.
+++.set i_shift16,                    -16
+++.set i_shift21,                    -11
+++
+++################################################################################
+++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, denom, unused, vpm_id)
+++::mc_setup_uv
+++# One-off chroma setup: reads frame geometry, loads constants, derives VPM/VDW base values and issues the first two pairs of texture requests
+++# Read starting kernel
+++mov ra31, unif  # ra31 = address branched to at the end of this kernel
+++
+++# Load first request location
+++add ra_x, unif, elem_num # Store x + elem_num
+++mov ra_y, unif # Store y
+++mov ra_frame_base, unif # Store frame u base
+++nop  # presumably separates the write of ra_frame_base from its read on the next line - TODO confirm
+++sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+++
+++# Read image dimensions
+++sub rb25,unif,1  # rb_frame_width_minus_1
+++sub rb30,unif,1  # rb_frame_height_minus_1
+++
+++# get source pitch
+++mov rb16, unif  # rb_pitch
+++
+++# get destination pitch
+++mov r0, unif
+++mov r1, vdw_setup_1(0)
+++add rb24, r1, r0  # rb24 = vdw_setup_1(0) + dst_pitch
+++
+++# load constants
+++
+++mov ra4, 0x10000  # multiplied with 16-bit VPM reads in mc_filter_uv_b before an arithmetic shift right by 16
+++mov ra_k1, 1
+++mov ra_k256, 256
+++mov ra30, 64
+++
+++mov rb20, 0xffffff00  # v8subs with this masks out all but the bottom byte
+++mov rb_k255, 255
+++mov rb23, 24
+++
+++# touch vertical context to keep simulator happy
+++
+++mov ra8, 0
+++mov ra9, 0
+++mov ra10, 0
+++mov ra11, 0
+++mov ra12, 0
+++mov ra13, 0
+++mov ra14, 0
+++mov ra15, 0
+++
+++# Compute base address for first and second access
+++mov r0, ra_x           # Load x
+++max r0, r0, 0; mov r1, ra_y # Load y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset  # shift for the first block = 8 * clamped x
+++add ra_y, r1, 1  # ra_y = y + 1, the row used for the second request below
+++add r0, r0, r3
+++and r0, r0, ~3  # clear the low 2 bits of the address
+++max r1, r1, 0 ; mov ra_x, r0 # y ; ra_x now holds the masked U address
+++min r1, r1, rb_frame_height_minus_1
+++# submit texture requests for first line
+++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+++add t0s, r0, r1 ; mov ra_frame_base, r2  # U row request ; ra_frame_base now holds the V fetch base
+++add t1s, r2, r1  # V row request
+++
+++mov r2, 9
+++add rb13, r2, unif  # denominator (10th uniform) + 9; NOTE(review): confirm the uniform writer puts denom, not offset, in this slot
+++mov -, unif         # Unused (11th uniform)
+++
+++# Compute part of VPM to use for DMA output
+++mov r2, unif  # vpm_id
+++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+++and r2, r2, 15
+++mov r1, r2
+++asr r1, r1, 2
+++shl r1, r1, 6
+++mov r0, r2
+++and r0, r0, 3
+++add r0, r0, r1
+++
+++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++add rb28, r0, r1  # VPM 8bit storage
+++asr r2, r0, 1     # r0 = bc0000d
+++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+++add rb21, r2, r1  # VPM for 16bit intermediates
+++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++shl r0, r0, 5
+++add rb27, r0, r1  # DMA out
+++
+++# submit texture requests for second line
+++max r1, ra_y, 0
+++min r1, r1, rb_frame_height_minus_1
+++add ra_y, ra_y, 1
+++bra -, ra31  # branch to the next kernel
+++nop ; mul24 r1, r1, rb_pitch  # Delay 1
+++add t0s, r1, ra_x  # Delay 2
+++add t1s, r1, ra_frame_base  # Delay 3
+++
+++
+++
+++################################################################################
+++
+++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+++
+++# At this point we have already issued two pairs of texture requests for the current block
+++# ra_x, ra_frame_base point to the current coordinates for this block
+++::mc_filter_uv
+++mov ra31, unif
+ +
+++# per-channel shifts were calculated on the *previous* invocation
+ +
+-+hevc_uv_deblock_16x16:
+-+  push r6-r15, lr
+-+  mov r14,0
+-+  b hevc_uv_start
+-+hevc_uv_deblock_16x16_with_clear:
+-+  push r6-r15, lr
+-+  mov r14,1
+-+  b hevc_uv_start
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num    # x
+++max r0, r0, 0         ; mov r1, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+++# compute offset from frame base u to frame base v
+++sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+++shl ra_xshift_next, r0, 3
+++add r0, r0, r3        ; mov ra1, unif  # ; width_height
+++and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
+++mov ra_y_next, r1     ; mov vw_setup, rb28
+++add ra_frame_base_next, rb_x_next, r2
+ +
+-+hevc_uv_start:
+-+  mov r9,r4
+-+  mov r4,r3
+-+  mov r13,r2
+-+  mov r2,r0
+-+  mov r10,r0
+-+  subscale4 r0,r1
+-+  mov r8,63
+-+  mov r6,-3
+-+  vmov H(zeros,0),0
+-+# r7 is number of blocks still to load
+-+# r0 is location of current block - 4 * stride
+-+# r1 is stride
+-+# r2 is location of current block
+-+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+-+# r4 is setup
+-+# r5 is for temporary calculations
+-+# r8 holds 63
+-+# r6 holds -3
+-+# r9 holds the number of 16 high rows to process
+-+# r10 holds the original img base
+-+# r11 returns 0 if no filtering was done on the edge
+-+# r12 saves a copy of this
+-+# r13 is copy of width
+-+# r14 is 1 if we should clear the old contents, or 0 if not
+++# set up VPM write
+++# get width,height of block
+ +
+-+uv_process_row:
+-+  # First iteration does not do horizontal filtering on previous
+-+  mov r7, r13
+-+  mov r3,0
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+-+  cmp r14,1
+-+  bne uv_skip0
+-+  vstb H(zeros,0),(r4)
+-+uv_skip0:
+-+  bl uv_vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+-+  bl uv_vert_filter
+-+  sub r3,8
+-+  b uv_start_deblock_loop
+-+uv_deblock_loop:
+-+  # Middle iterations do vertical on current block and horizontal on preceding
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)
+-+  cmp r14,1
+-+  bne uv_skip1
+-+  vstb H(zeros,0),(r4)
+-+uv_skip1:
+-+  bl uv_vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_vert_filter
+-+  sub r3,8
+-+  vldb H(setup_input,0), -16(r4)
+-+  cmp r14,1
+-+  bne uv_skip3
+-+  vstb H(zeros,0),-16(r4)
+-+uv_skip3:
+-+  bl uv_horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_horz_filter
+-+  sub r3,8*64
+-+  addcmpbeq r12,0,0,uv_skip_save_top
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+uv_skip_save_top:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+uv_start_deblock_loop:
+-+  # move onto next 16x16 (could do this with circular buffer support instead)
+-+  add r3,16
+-+  and r3,r8
+-+  add r4,32
+-+  # Perform loop counter operations (may work with an addcmpbgt as well?)
+-+  add r0,16
+-+  add r2,16
+-+  sub r7,1
+-+  cmp r7,0 # Are there still more blocks to load
+-+  bgt uv_deblock_loop
+++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1  # rb17 = height + 1
+++add rb18, ra1.16a, 3  # rb18 = height + 3
+++shl r0,   ra1.16a, 7
+++add r0,   r0, ra1.16b    # Combine width and height of destination area
+++shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+++add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # .ifnz below only writes elements 8-15
+++
+++# unpack filter coefficients
+++
+++mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
+++mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
+++nop                   ; mov rb10, ra3.8c
+++mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+++
+++shl r1, ra1.16b, rb13
+++asr rb12, r1, 1
+++shl rb14, ra1.16a, 1  # rb14 = weight*2
+++
+++# rb14 - weight L0 * 2
+++# rb13 = weight denom + 6 + 9
+++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++
+++# r2 is elem_num
+++# Loop: each pass takes one pair of TMU results, requests the next row and horizontally filters it;
+++# once r3 >= 4 it also runs the vertical filter + weighting and writes one clamped row to the VPM
+++
+++# r3 = 0
+++:uvloop
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch  # the .ifz moves fire when the sub.setf above (r3 vs rb17) gave zero
+++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++# apply horizontal filter
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop
+++mov ra13, ra14          ; mul24 r1, ra14, rb9  # Delay 1
+++mov ra14, ra15  # Delay 2
+++mov ra15, r0            ; mul24 r0, ra12, rb8  # Delay 3
+++# >>> .anyn uvloop
+++
+++# apply vertical filter and write to VPM
+++
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++add r1, r1, r0          ; mul24 r0, ra15, rb11
+++sub r1, r1, r0          ; mov -, vw_wait
+++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++asr r1, r1, 14
+++nop                     ; mul24 r1, r1, rb14
+++shl r1, r1, 8
+++
+++add r1, r1, rb12
+++brr.anyn -, r:uvloop
+++asr r1, r1, rb13         # Delay 1
+++min r1, r1, rb_k255       # Delay 2
+++max vpm, r1, 0         # Delay 3
+++
+++# DMA out for U
+++
+++mov vw_setup, rb26 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+++
+++# DMA out for V
+++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+++# Could potentially push this write into the start of the next pipeline stage.
+++mov r0, 16
+++mov -, vw_wait
+++
+++bra -, ra31
+++add vw_setup, rb26, r0 # VDW setup 0 (Delay 1)
+++mov vw_setup, rb29 # Stride (Delay 2)
+++mov vw_addr, unif # start the VDW (Delay 3)
+++
+++
+++################################################################################
+++
+++# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, weight_u, weight_v, this_u_dst, this_v_dst)  -- the last two are read and discarded
+++# First pass of the b0/b pair: filtered rows are written to the VPM via the 16-bit setup (rb21); nothing is DMAed out here
+++# At this point we have already issued two pairs of texture requests for the current block
+++# ra_x, ra_frame_base point to the current coordinates for this block
+++::mc_filter_uv_b0
+++mov ra31, unif
+++
+++# per-channel shifts were calculated on the *previous* invocation
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num       # x
+++max r0, r0, 0                ; mov r1, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+++sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+++shl ra_xshift_next, r0, 3
+++add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
+++and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
+++mov ra_y_next, r1            ; mov vw_setup, rb21  # ; VPM write setup for 16bit intermediates
+++
+++add ra_frame_base_next, rb_x_next, r2
+++
+++# Need to have unsigned coeffs so we can just unpack in the filter
+++# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+++# filter code. Unpack into b regs for V
+++
+++# set up VPM write, we need to save 16bit precision
+++
+++sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1            # rb17 = height + 1
+++add rb18, ra1.16a, 3            # rb18 = height + 3
+++shl r0,   ra1.16a, 7
+++add r0,   r0, ra1.16b           # Combine width and height of destination area
+++shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
+++add rb26, r0, rb27
+++
+++mov rb8, ra3.8a
+++mov rb9, ra3.8b
+++mov rb10, ra3.8c
+++mov rb11, ra3.8d
+++
+++# r2 is elem_num
+++# r3 is loop counter
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++mov      rb14, unif                 # U weight L0
+++mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
+++# rb14 unused in b0 but will hang around till the second pass
+++
+++# Loop: each pass takes one pair of TMU results, requests the next row and horizontally filters it;
+++# once r3 >= 4 it also applies the vertical filter and writes one row (>> 6) to the VPM
+++
+++# r3 = 0
+++:uvloop_b0
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch  # the .ifz moves fire when the sub.setf above (r3 vs rb17) gave zero
+++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop_b0
+++mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13 (Delay 1)
+++mov ra14, ra15  # Delay 2
+++mov ra15, r0            ; mul24 r0, ra12, rb8  # Delay 3
+++# >>> .anyn uvloop_b0
+++
+++# apply vertical filter and write to VPM
+++
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++sub.setf -, r3, rb18
+++brr.anyn -, r:uvloop_b0
+++add r1, r1, r0          ; mul24 r0, ra15, rb11  # Delay 1
+++sub r1, r1, r0          ; mov -, vw_wait  # Delay 2
+++asr vpm, r1, 6  # Delay 3: write the row to the VPM
+++# >>> .anyn uvloop_b0
+++
+++# in pass0 we don't really need to save any results, but need to discard the uniforms
+++# (no DMA out in this pass)
+++
+++bra -, ra31
+++mov -, unif           # Delay 1
+++mov -, unif           # Delay 2
+++nop                   # Delay 3
+++
+++
+++################################################################################
+++
+++::mc_filter_uv_b
+++mov ra31, unif
+++
+++# per-channel shifts were calculated on the *previous* invocation
+++
+++# set up VPM write
+++mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num    # x
+++max r0, r0, 0                      ; mov ra_y_next, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
+++# compute offset from frame base u to frame base v
+++sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
+++add r0, r0, r3                     ; mov ra1, unif       # width_height
+++and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
+++
+++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1
+++add rb18, ra1.16a, 3
+++shl r0,   ra1.16a, 7
+++
+++add ra_frame_base_next, rb_x_next, r2
+++
+++# r0 is currently height<<7
+++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+++shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+++shr r3, r3, 8
+++add vr_setup, r3, rb21
+++
+++add r0, r0, ra1.16b    # Combine width and height of destination area
+++shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+++add rb26, r0, rb27
+ +
+-+  # Final iteration needs to just do horizontal filtering
+-+  vldb H(setup_input,0), -16(r4)
+-+  cmp r14,1
+-+  bne uv_skip2
+-+  vstb H(zeros,0),-16(r4)
+-+uv_skip2:
+-+  bl uv_horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_horz_filter
+-+  sub r3,64*8
+-+  addcmpbeq r12,0,0,uv_skip_save_top2
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+uv_skip_save_top2:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++# get filter coefficients
+ +
+-+# Now look to see if we should do another row
+-+  sub r9,1
+-+  cmp r9,0
+-+  bgt uv_start_again
+-+  pop r6-r15, pc
+-+uv_start_again:
+-+  # Need to sort out r0,r2 to point to next row down
+-+  addscale16 r10,r1
+-+  mov r2,r10
+-+  subscale4 r0,r2,r1
+-+  b uv_process_row
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+++# Get offset & weight stuff
+ +
+-+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+-+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+++# The unif read occurs unconditionally, only the write is conditional
+++mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
+++mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
+++nop                 ; mov rb10, ra3.8c
+++mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
+ +
+-+uv_vert_filter:
+-+  push lr
+++shl r1, ra1.16b, rb13
+++asr rb12, r1, 1
+ +
+-+  vmov HX(P1,0), V(16,14)+r3
+-+  vmov HX(P0,0), V(16,15)+r3
+-+  vmov HX(Q0,0), V(16,16)+r3
+-+  vmov HX(Q1,0), V(16,17)+r3
+++# ra1.16a used directly in the loop
+ +
+-+  bl do_chroma_filter
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+  vadds V(16,15)+r3, HX(P0,0), 0
+-+  vadds V(16,16)+r3, HX(Q0,0), 0
+++# r3 = 0
+++:uvloop_b
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+  pop pc
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+ +
+-+# Filter edge at H(16,0)+r3
+-+uv_horz_filter:
+-+  push lr
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+ +
+-+  vmov HX(P1,0), H(14,0)+r3
+-+  vmov HX(P0,0), H(15,0)+r3
+-+  vmov HX(Q0,0), H(16,0)+r3
+-+  vmov HX(Q1,0), H(17,0)+r3
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+ +
+-+  bl do_chroma_filter
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+  vadds H(15,0)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds H(16,0)+r3, HX(Q0,0), 0
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop_b
+++mov ra13, ra14          ; mul24 r1, ra14, rb9
+++mov ra14, ra15
+++mov ra15, r0            ; mul24 r0, ra12, rb8
+++# >>> .anyn uvloop_b
+ +
+-+  pop pc
+++# apply vertical filter and write to VPM
+ +
+-+# r4 points to array of beta/tc for each 4 length edge
+-+do_chroma_filter:
+-+  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+-+  valtl HX(tc,0),H(setup,0),H(setup,0)
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++add r1, r1, r0          ; mul24 r0, ra15, rb11
+++# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+++sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
+++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++asr r1, r1, 14          # shift2=6
+ +
+-+  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+-+  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+-+  vsub -,HX(P1,0),HX(Q1,0) SACC
+-+  vmov HX(delta,0),4 SACC
+-+  vasr HX(delta,0),HX(delta,0),3
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+-+  vadd HX(P0,0),HX(P0,0),HX(delta,0)
+-+  vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+-+  b lr
+++asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+++nop                     ; mul24 r0, r0, rb14
+ +
+-+# r0 = list
+-+# r1 = number
+-+hevc_run_command_list:
+-+  push r6-r7, lr
+-+  mov r6, r0
+-+  mov r7, r1
+-+loop_cmds:
+-+  ld r0,(r6) # How to encode r6++?
+-+  add r6,4
+-+  ld r1,(r6)
+-+  add r6,4
+-+  ld r2,(r6)
+-+  add r6,4
+-+  ld r3,(r6)
+-+  add r6,4
+-+  ld r4,(r6)
+-+  add r6,4
+-+  ld r5,(r6)
+-+  add r6,4
+-+  bl hevc_trans_16x16
+-+  sub r7,1
+-+  cmp r7,0
+-+  bgt loop_cmds
+++add r1, r1, r0          ; mov -, vw_wait
+++shl r1, r1, 8           # Lose bad top 8 bits & sign extend
+ +
+-+  pop r6-r7, pc
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 91777be..5aa0432 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -397,6 +397,8 @@ static void *vpu_start(void *arg) {
+-   int start_time;
+-   int end_time;
+-   int count=0;
+-+  int count_deblock=0;
+-+  int count_qpu=0;
+- #endif
+-   while(1) {
+-     int i;
+-@@ -442,7 +444,7 @@ static void *vpu_start(void *arg) {
+-         break;
+-       }
+-     }
+--    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+-+    //printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+- #endif
+-     qpu_code = p[7];
+-     qpu_codeb = p[16];
+-@@ -460,6 +462,12 @@ static void *vpu_start(void *arg) {
+-     off_time += start_time-last_time;
+- #endif
+- 
+-+#define NO_FLUSH 1
+-+#define CLEAR_PROFILE 2
+-+#define OUTPUT_COUNTS 4
+++add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+ +
+-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++brr.anyn -, r:uvloop_b
+++asr r1, r1, rb13         # Delay 1
+++min r1, r1, rb_k255       # Delay 2
+++max vpm, r1, 0         # Delay 3
+ +
+- #ifdef RPI_COMBINE_JOBS
+-     if (have_qpu) {
+-       for(i=0;i<8;i++) {
+-@@ -472,14 +480,14 @@ static void *vpu_start(void *arg) {
+-       }
+-       if (have_vpu) {
+-         execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
+-         q[0] = 0;
+-       } else {
+-         execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-@@ -510,7 +518,7 @@ static void *vpu_start(void *arg) {
+-       execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
+- #else
+-       execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING , 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-@@ -525,17 +533,20 @@ static void *vpu_start(void *arg) {
+-     // There are three cases we may wish to distinguish of VPU/QPU activity
+-     on_time += end_time - start_time;
+- #else
+--    if (p[6]==2)
+-+    if (p[6]>1) {
+-+      count_deblock++;
+-       on_time_deblock += end_time - start_time;
+--    else
+-+    } else {
+-       on_time += end_time - start_time;
+-+      count_qpu++;
+-+    }
+- #endif
+-     count++;
+-     if ((count&0x7f)==0)
+- #ifdef RPI_COMBINE_JOBS
+--      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+--#else
+-       printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#else
+-+      printf("Posted %d On=%dms (%d calls), On_deblock=%dms (%d calls), Off=%dms\n",count,(int)(on_time/1000),count_qpu,(int)(on_time_deblock/1000),count_deblock,(int)(off_time/1000));
+- #endif
+- #endif
+- job_done_early:
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 0686249..64bf5b0 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -1077,6 +1077,17 @@ nop        ; nop # delay slot 2
+- ::mc_interrupt_exit12
+-   mov  -, vw_wait # wait on the VDW
+- 
+-+  # Dummy wait to test instructions
+-+#  mov r3,1000000
+-+#:dummy_loop
+-+#  sub.setf r3, r3, 1
+-+#  nop
+-+#  nop
+-+#  brr.anynn -, r:dummy_loop
+-+#  nop
+-+#  nop
+-+#  nop
+ +
+-   ldtmu0
+-   ldtmu0
+-   ldtmu1
+--- 
+-2.7.4
+-
+-
+-From 12a194bddd049ab97154e9fbdd46b63b558a3bee Mon Sep 17 00:00:00 2001
+-From: Ben Avison <bavison@riscosopen.org>
+-Date: Tue, 23 Jun 2015 23:42:03 +0100
+-Subject: [PATCH 67/68] armv7/hevc: Optimise deblocking boundary strength
+- calculation
+-
+----
+- libavcodec/arm/hevcdsp_deblock_neon.S | 115 +++++++++++++++++
+- libavcodec/arm/hevcdsp_init_neon.c    |   9 ++
+- libavcodec/hevc.h                     |  11 --
+- libavcodec/hevc_filter.c              | 224 ++++++++++++++--------------------
+- libavcodec/hevcdsp.c                  | 116 ++++++++++++++++++
+- libavcodec/hevcdsp.h                  |  14 +++
+- 6 files changed, 344 insertions(+), 145 deletions(-)
+-
+-diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+-index 166bddb..bad4589 100644
+---- a/libavcodec/arm/hevcdsp_deblock_neon.S
+-+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+-@@ -383,3 +383,118 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+-         vst1.8   {d4}, [r0]
+-         bx       lr
+- endfunc
+++# DMA out for U
+ +
+-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+-+ */
+-+function ff_hevc_deblocking_boundary_strengths_neon, export=1
+-+        add         ip, sp, #4*4
+-+        push        {a2-a4,v1-v8,lr}
+-+        ldmia       ip, {v5-v7}
+-+1:      ldmdb       ip, {v1-v4}
+-+        ldrsb       a3, [v5, #8]    @ curr->ref_idx
+-+        ldrsb       v8, [v5, #9]
+-+        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
+-+        ldrsb       lr, [v6, #9]
+-+        ldr         v1, [v1, a3, lsl #2]
+-+        ldrb        a3, [v5, #10]   @ curr->pred_flag
+-+        ldr         v2, [v2, v8, lsl #2]
+-+        ldrb        v8, [v6, #10]   @ neigh->pred_flag
+-+        ldr         v3, [v3, ip, lsl #2]
+-+        ldr         v4, [v4, lr, lsl #2]
+-+        teq         a3, #3
+-+        beq         20f
+-+        teq         v8, #3
+-+        beq         90f
+++mov vw_setup, rb26 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+ +
+-+        tst         a3, #1
+-+        ldrne       a3, [v5, #0]    @ curr->mv[0]
+-+        ldreq       a3, [v5, #4]    @ curr->mv[1]
+-+        moveq       v1, v2
+-+        tst         v8, #1
+-+        ldrne       v8, [v6, #0]    @ neigh->mv[0]
+-+        ldreq       v8, [v6, #4]    @ neigh->mv[1]
+-+        moveq       v3, v4
+-+        teq         v1, v3
+-+        bne         10f
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v8, a3
+-+        ssub16      a3, a3, v8
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        @ drop through
+-+10:     movne       a3, #1
+-+11:     subs        a2, a2, #1
+-+12:     strbhs      a3, [v7], a4
+-+        subs        a2, a2, #1
+-+        bhs         12b
+++# DMA out for V
+++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+++# Could potentially push this write into the start of the next pipeline stage.
+++mov r0, 16
+++mov -, vw_wait
+++
+++bra -, ra31
+++add vw_setup, rb26, r0 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+++
+++################################################################################
+++
+++# mc_exit()
+++
+++::mc_exit
+++mov  -, vw_wait # wait on the VDW
+++
+++mov -,srel(0)
+++
+++ldtmu0
+++ldtmu1
+++ldtmu0
+++ldtmu1
+++
+++nop        ; nop ; thrend
+++nop        ; nop # delay slot 1
+++nop        ; nop # delay slot 2
+++
+++# mc_interrupt_exit8()
+++::mc_interrupt_exit8
+++mov  -, vw_wait # wait on the VDW
+++
+++ldtmu0
+++ldtmu1
+++ldtmu0
+++ldtmu1
+++
+++mov -,sacq(0) # 1
+++mov -,sacq(0) # 2
+++mov -,sacq(0) # 3
+++mov -,sacq(0) # 4
+++mov -,sacq(0) # 5
+++mov -,sacq(0) # 6
+++mov -,sacq(0) # 7
+++
+++nop        ; nop ; thrend
+++mov interrupt, 1; nop # delay slot 1
+++nop        ; nop # delay slot 2
+++
+++
+++
+++
+++
+++# LUMA CODE
+++
+++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+++# For P frames we make the second x,y coordinates offset by +8
+ +
+-+        ldm         sp, {a2, a3}
+-+        add         ip, sp, #16*4
+-+        subs        a1, a1, #1
+-+        add         v5, v5, a3
+-+        add         v6, v6, a3
+-+        bhi         1b
+-+        pop         {a2-a4,v1-v8,pc}
+++################################################################################
+++# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+++::mc_setup
+++  mov r3, 16
+ +
+-+20:     teq         v8, #3
+-+        bne         10b
+++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+++  mov ra8, unif  # y_x
+++  mov ra9, unif  # ref_y_base
+++  mov ra10, unif # y2_x2
+++  mov ra11, unif # ref_y2_base
+ +
+-+        teq         v1, v3
+-+        teqeq       v2, v4
+-+        bne         40f
+-+        teq         v1, v2
+-+        bne         30f
+++# Read image dimensions
+++  mov r1, unif # width_height
+++  shl r0,r1,r3
+++  asr r1,r1,r3 # width
+++  asr r0,r0,r3 # height
+++  sub rb_frame_width_minus_1,r1,1
+++  sub rb_frame_height_minus_1,r0,1
+ +
+-+        ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v3, v1
+-+        ssub16      a3, v1, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         25f
+-+        ssub16      ip, v4, v2
+-+        ssub16      a3, v2, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        beq         11b
+-+        @ drop through
+-+25:     ssub16      ip, v4, v1
+-+        ssub16      a3, v1, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         10b
+-+        ssub16      ip, v3, v2
+-+        ssub16      a3, v2, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        b           10b
+++# get source pitch
+++  mov rb_pitch, unif # src_pitch
+ +
+-+30:     ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v3, v1
+-+        ssub16      a3, v1, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         10b
+-+        ssub16      ip, v4, v2
+-+        ssub16      a3, v2, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        b           10b
+++# get destination pitch
+++  mov r0, unif       # dst_pitch
+++  mov r1, vdw_setup_1(0)
+++  add rb24, r1, r0
+ +
+-+40:     teq         v1, v4
+-+        teqeq       v2, v3
+-+        bne         10b
+++# Compute base address for first and second access
+++  mov r1, ra8 # y_x
+++  shl r0,r1,r3 # r0 is x<<16
+++  asr r1,r1,r3 # r1 is y
+++  asr r0,r0,r3 # r0 is x
+++  add r0, r0, elem_num # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+++  shl ra_xshift_next, r0, 3 # Compute shifts
+++  add ra_y, r1, 1
+++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+++  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+++  max r1, r1, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+++  add t0s, r2, r1 ; mov ra_frame_base, r2
+ +
+-+        ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        b           25b
++  mov r1, ra10 # y2_x2
+++  shl r0,r1,r3 # r0 is x<<16
+++  asr r1,r1,r3 # r1 is y
+++  asr r0,r0,r3 # r0 is x
+++  add r0, r0, elem_num # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+++  shl rx_xshift2_next, r0, 3 # Compute shifts
+++  add ra_y2, r1, 1
+++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+++  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+++  max r1, r1, 0
+++  min r1, r1, rb_frame_height_minus_1
++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame1
+++  add t1s, r2, r1 ; mov ra_frame_base2, r2
+ +
+-+90:     mov         a3, #1
+-+        b           11b
+-+endfunc
+-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
+-index e5da7e9..49c70dd 100644
+---- a/libavcodec/arm/hevcdsp_init_neon.c
+-+++ b/libavcodec/arm/hevcdsp_init_neon.c
+-@@ -290,6 +290,10 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
+- }
+- #undef CMP
+- 
+-+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                                MvField *curr, MvField *neigh, uint8_t *bs);
+ +
+- av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+- {
+-     if (bit_depth == 8) {
+-@@ -387,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+-         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+-         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+-     }
+++# load constants
+ +
+-+    assert(offsetof(MvField, mv) == 0);
+-+    assert(offsetof(MvField, ref_idx) == 8);
+-+    assert(offsetof(MvField, pred_flag) == 10);
+-+    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
+- }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7eb37e6..496c0e1 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -684,17 +684,6 @@ typedef struct CodingUnit {
+-     uint8_t cu_transquant_bypass_flag;
+- } CodingUnit;
+- 
+--typedef struct Mv {
+--    int16_t x;  ///< horizontal component of motion vector
+--    int16_t y;  ///< vertical component of motion vector
+--} Mv;
+--
+--typedef struct MvField {
+--    DECLARE_ALIGNED(4, Mv, mv)[2];
+--    int8_t ref_idx[2];
+--    int8_t pred_flag;
+--} MvField;
+--
+- typedef struct NeighbourAvailable {
+-     int cand_bottom_left;
+-     int cand_left;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 6367068..826a82f 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -726,69 +726,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-     }
+- }
+- 
+--static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+--                             RefPicList *neigh_refPicList)
+--{
+--    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+--        // same L0 and L1
+--        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+--            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+--            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+--            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+--                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+--                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+--                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+--                return 1;
+--            else
+--                return 0;
+--        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+--                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+--            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+--                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+--                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+--            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+--                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else {
+--            return 1;
+--        }
+--    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+--        Mv A, B;
+--        int ref_A, ref_B;
+--
+--        if (curr->pred_flag & 1) {
+--            A     = curr->mv[0];
+--            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+--        } else {
+--            A     = curr->mv[1];
+--            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+--        }
+--
+--        if (neigh->pred_flag & 1) {
+--            B     = neigh->mv[0];
+--            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+--        } else {
+--            B     = neigh->mv[1];
+--            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+--        }
+--
+--        if (ref_A == ref_B) {
+--            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else
+--            return 1;
+--    }
+--
+--    return 1;
+--}
+- 
+- void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-                                            int log2_trafo_size)
+-@@ -799,10 +736,17 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
+-     int min_pu_width     = s->ps.sps->min_pu_width;
+-     int min_tu_width     = s->ps.sps->min_tb_width;
+--    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
+--                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
+-     int boundary_upper, boundary_left;
+--    int i, j, bs;
+-+    int i, j;
+-+    RefPicList *rpl      = s->ref->refPicList;
+-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+-+    int y_pu             = y0 >> log2_min_pu_size;
+-+    int x_pu             = x0 >> log2_min_pu_size;
+-+    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+-+    int is_intra         = curr->pred_flag == PF_INTRA;
+-+    int inc              = log2_min_pu_size == 2 ? 2 : 1;
+-+    uint8_t *bs;
+- 
+- #ifdef DISABLE_STRENGTHS
+-     return;
+-@@ -818,34 +762,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+-         boundary_upper = 0;
+- 
+-+    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+++  mov ra_k1, 1
+++  mov ra_k256, 256
+++  mov ra30, 64
+++
+++  mov rb20, 0xffffff00
+++  mov rb_k255, 255
+++  mov rb23, 24
+++
+++# touch vertical context to keep simulator happy
+++
+++  mov ra8, 0
+++  mov ra9, 0
+++  mov ra10, 0
+++  mov ra11, 0
+++  mov ra12, 0
+++  mov ra13, 0
+++  mov ra14, 0
+++  mov ra15, 0
+ +
+-     if (boundary_upper) {
+-         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
+-                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
+--                              s->ref->refPicList;
+--        int yp_pu = (y0 - 1) >> log2_min_pu_size;
+--        int yq_pu =  y0      >> log2_min_pu_size;
+--        int yp_tu = (y0 - 1) >> log2_min_tu_size;
+--        int yq_tu =  y0      >> log2_min_tu_size;
+-+                              rpl;
+-+        MvField *top = curr - min_pu_width;
+++# Compute part of VPM to use
+++  mov r2, qpu_num
+++  mov r1, r2
+++  asr r1, r1, 2
+++  shl r1, r1, 6
+++  mov r0, r2
+++  and r0, r0, 3
+++  add r0, r0, r1
+++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++  add rb28, r0, r1  # VPM for saving data
+++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++  shl r0, r0, 5
+++  add rb27, r0, r1  # Command for dma output
+ +
+-+        if (is_intra) {
+-+            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+-+                bs[i >> 2] = 2;
+++# Weighted prediction denom
+++  add rb13, unif, 9  # unif = weight denom + 6
+ +
+-+        } else {
+-+            int y_tu = y0 >> log2_min_tu_size;
+-+            int x_tu = x0 >> log2_min_tu_size;
+-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+-+            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+++  mov -, unif # Unused
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+-+                    curr, top, bs);
+- 
+-             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int x_pu = (x0 + i) >> log2_min_pu_size;
+--                int x_tu = (x0 + i) >> log2_min_tu_size;
+--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+--                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
+--                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
+--
+--                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+--                    bs = 2;
+--                else if (curr_cbf_luma || top_cbf_luma)
+--                    bs = 1;
+--                else
+--                    bs = boundary_strength(s, curr, top, rpl_top);
+--                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+-+                int i_pu = i >> log2_min_pu_size;
+-+                int i_tu = i >> log2_min_tu_size;
+++# submit texture requests for second line
+++  max r1, ra_y, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1
+++  nop ; mul24 r1, r1, rb_pitch
+++  add t0s, r1, ra_frame_base
+ +
+-+                if (top[i_pu].pred_flag == PF_INTRA)
+-+                    bs[i >> 2] = 2;
+-+                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+-+                    bs[i >> 2] = 1;
+-             }
+-+        }
+-+    }
+++  max r1, ra_y2, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1
+++  nop ; mul24 r1, r1, rb_pitch
+++  add t1s, r1, ra_frame_base2
+ +
+-+    if (!is_intra) {
+-+        for (j = inc; j < trafo_in_min_pus; j += inc) {
+-+            MvField *top;
++# FALL THROUGH TO PER-BLOCK SETUP
+ +
+-+            curr += min_pu_width * inc;
+-+            top = curr - min_pu_width;
+-+            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+++# Start of per-block setup code
+++# P and B blocks share the same setup code to save on Icache space
+++:per_block_setup
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  mov ra31, unif
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+-+                    curr, top, bs);
+-+        }
+-     }
+- 
+--    // bs for vertical TU boundaries
+-     boundary_left = x0 > 0 && !(x0 & 7);
+-     if (boundary_left &&
+-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
+-@@ -856,64 +822,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+-         boundary_left = 0;
+- 
+-+    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+-+    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+++  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
+ +
+-     if (boundary_left) {
+-         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
+-                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
+--                               s->ref->refPicList;
+--        int xp_pu = (x0 - 1) >> log2_min_pu_size;
+--        int xq_pu =  x0      >> log2_min_pu_size;
+--        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+--        int xq_tu =  x0      >> log2_min_tu_size;
+--
+--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int y_pu      = (y0 + i) >> log2_min_pu_size;
+--                int y_tu      = (y0 + i) >> log2_min_tu_size;
+--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+--                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+--                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+--
+--                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+--                    bs = 2;
+--                else if (curr_cbf_luma || left_cbf_luma)
+--                    bs = 1;
+--                else
+--                    bs = boundary_strength(s, curr, left, rpl_left);
+--                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+--            }
+--    }
+-+                               rpl;
+-+        MvField *left = curr - 1;
+- 
+--    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
+--        RefPicList *rpl = s->ref->refPicList;
+-+        if (is_intra) {
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+-+                bs[j * s->bs_width >> 2] = 2;
+- 
+--        // bs for TU internal horizontal PU boundaries
+--        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+--            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+--            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+--
+--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int x_pu = (x0 + i) >> log2_min_pu_size;
+--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+--
+--                bs = boundary_strength(s, curr, top, rpl);
+--                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+-+        } else {
+-+            int y_tu = y0 >> log2_min_tu_size;
+-+            int x_tu = x0 >> log2_min_tu_size;
+-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+-+            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+++# per-channel shifts were calculated on the *previous* invocation
+++  mov ra_xshift, ra_xshift_next
+++  mov rx_xshift2, rx_xshift2_next
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++
+++  add r0, ra1.16a, r1 # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+++  shl ra_xshift_next, r0, 3 # Compute shifts
+++  mov r3, 8                          ; mov ra_y_next, ra1.16b
+++  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
+++  add ra_frame_base_next, r2, r0
+++
+++  add r0, ra1.16a, r1 # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+++  shl rx_xshift2_next, r0, 3         # Compute shifts
+++  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
+++  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
+++  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
+++
+++# set up VPM write
+++  mov vw_setup, rb28
+++
+++# get width,height of block (unif load above)
+++  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+++  add rb17, ra1.16a, 5
+++  add rb18, ra1.16a, 7
+++  shl r0,   ra1.16a, 7
+++  add r0,   r0, ra1.16b # Combine width and height of destination area
+++  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
+++
+++# get filter coefficients and discard unused B frame values
+++  shl.ifz r0, r0, i_shift16      # Pick half to use
+++  shl ra8, r0, 3
+++
+++# Pack the 1st 4 filter coefs for H & V tightly
+++
+++  mov r1,0x00010100  # -ve
+++  ror ra2.8a, r1, ra8.8d
+++  ror ra0.8a, r1, ra8.8c
+++
+++  mov r1,0x01040400
+++  ror ra2.8b, r1, ra8.8d
+++  ror ra0.8b, r1, ra8.8c
+++
+++  mov r1,0x050b0a00  # -ve
+++  ror ra2.8c, r1, ra8.8d
+++  ror ra0.8c, r1, ra8.8c
+++
+++  mov r1,0x11283a40
+++  ror ra2.8d, r1, ra8.8d
+++  ror ra0.8d, r1, ra8.8c
+++
+++# In the 2nd vertical half we use b registers due to
++# using a-side fifo regs. The easiest way to achieve this is to pack it
+++# and then unpack!
+++
+++  mov r1,0x3a281100
+++  ror ra3.8a, r1, ra8.8d
+++  ror ra1.8a, r1, ra8.8c
+++
+++  mov r1,0x0a0b0500  # -ve
+++  ror ra3.8b, r1, ra8.8d
+++  ror ra1.8b, r1, ra8.8c
+++
+++  mov r1,0x04040100
+++  ror ra3.8c, r1, ra8.8d
+++  ror ra1.8c, r1, ra8.8c
+++
+++# Extract weighted prediction information in parallel
+++
+++  mov r1,0x01010000  # -ve
++  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 (hi16)/weight L0 (lo16)
+++  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
+++
+++# r3 = 16 from (long way) above
++  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+++  asr ra18, r0, r3          ; mov rb5, ra3.8b
+++  bra -, ra31
+++  shl r0, r0, r3            ; mov rb6, ra3.8c
+++  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
+++  asr rb12, r1, 9
+++
+++# >>> branch ra31
+++#
+++# r3 = 0
+++# ra18 = weight L1
+++# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
+++# rb13 = weight denom + 6 + 9
+++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++
+++
+++################################################################################
+++# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# In a P block, y2_x2 should be y_x+8
+++# At this point we have already issued two pairs of texture requests for the current block
+++
+++::mc_filter
+++# r0 = weight << 16; We want weight * 2 in rb14
+++  asr rb14, r0, 15
+++
+++# r3 = 0
+++
+++:yloop
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++# If we knew there was no clipping then this code would get simpler.
+++# Perhaps we could add on the pitch and clip using larger values?
+++
+++# N.B. Whilst y == y2 as far as this loop is concerned we will start
+++# the grab for the next block before we finish with this block and that
+++# might be B where y != y2 so we must do full processing on both y and y2
+++
+++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++
+++  max r2, ra_y, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++
+++  max r2, ra_y2, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++# apply horizontal filter
+++  nop                  ; mul24      r3, ra0.8a,      r0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  sub r0, r2, r3       ; mov r3, rb31
+++
+++  sub.setf -, r3, 8       ; mov r1,   ra8
+++  mov ra8,  ra9           ; mov rb8,  rb9
+++  brr.anyn -, r:yloop
+++  mov ra9,  ra10          ; mov rb9,  rb10
+++  mov ra10, ra11          ; mov rb10, rb11
+++  mov ra11, r0            ; mov rb11, r1
+++  # >>> .anyn yloop
+++
+++  # apply vertical filter and write to VPM
+++
+++  nop                     ; mul24 r0, rb8,  ra2.8a
+++  nop                     ; mul24 r1, rb9,  ra2.8b
+++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++  add r1, r1, r0          ; mul24 r0, ra11, rb7
+++  sub r1, r1, r0          ; mov -, vw_wait
+++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+++#  +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+++# The top 8 bits have rubbish in them as mul24 is unsigned
+++# The low 6 bits need discard before weighting
+++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256  # x256 - sign extend & discard rubbish
+++  asr r1, r1, 14
+++  nop                     ; mul24 r1, r1, rb14
+++  add r1, r1, rb12
+++
+++  shl r1, r1, 8
+++  brr.anyn -, r:yloop
+++  asr r1, r1, rb13
+++# We have a saturating pack unit - I can't help feeling it should be useful here
+++  min r1, r1, rb_k255       # Delay 2  rb_k255 = 255
+++  max vpm, r1, 0         # Delay 3
+++# >>> branch.anyn yloop
+++
+++# DMA out
+++
+++  brr -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, unif # start the VDW   Delay 3
+++
+++
+++
+++################################################################################
+++
+++# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# In a P block, only the first half of coefficients contain used information.
+++# At this point we have already issued two pairs of texture requests for the current block
+++# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+++# Can fill in the coefficients so only
+++# Can also assume default weighted prediction for B frames.
+++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+++# Or possibly by taking advantage of symmetry?
+++# From 19->7 32bits per command.
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+-+                    curr, left, bs);
+++::mc_filter_b
+++  # r0 = weightL0 << 16, we want it in rb14
+++  asr rb14, r0, i_shift16
+ +
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+-+                int j_pu = j >> log2_min_pu_size;
+-+                int j_tu = j >> log2_min_tu_size;
+++:yloopb
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+-+                    bs[j * s->bs_width >> 2] = 2;
+-+                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+-+                    bs[j * s->bs_width >> 2] = 1;
+-             }
+-         }
+-+    }
+- 
+--        // bs for TU internal vertical PU boundaries
+--        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+--            int y_pu = (y0 + j) >> log2_min_pu_size;
+-+    if (!is_intra) {
+-+        for (i = inc; i < trafo_in_min_pus; i += inc) {
+-+            MvField *left;
+- 
+--            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
+--                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
+--                int xq_pu = (x0 + i)     >> log2_min_pu_size;
+--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+-+            curr += inc;
+-+            left = curr - 1;
+-+            bs += inc << log2_min_pu_size >> 2;
+- 
+--                bs = boundary_strength(s, curr, left, rpl);
+--                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+--            }
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+-+                    curr, left, bs);
+-         }
+-     }
+- }
+-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
+-index 9d773d9..a6534a9 100644
+---- a/libavcodec/hevcdsp.c
+-+++ b/libavcodec/hevcdsp.c
+-@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+- #include "hevcdsp_template.c"
+- #undef BIT_DEPTH
+- 
+-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                               MvField *curr, MvField *neigh, uint8_t *bs)
+-+{
+-+    for (; pus > 0; pus--) {
+-+        int strength, out;
+-+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+-+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+-+        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+-+        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+++# If we knew there was no clipping then this code would get simpler.
+++# Perhaps we could add on the pitch and clip using larger values?
+ +
+-+#if 1 // This more directly matches the original implementation
+-+        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+-+            // same L0 and L1
+-+            if (curr_refL0 == neigh_refL0 &&
+-+                curr_refL0 == curr_refL1 &&
+-+                neigh_refL0 == neigh_refL1) {
+-+                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-+                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+-+                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-+                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else if (neigh_refL0 == curr_refL0 &&
+-+                       neigh_refL1 == curr_refL1) {
+-+                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-+                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else if (neigh_refL1 == curr_refL0 &&
+-+                       neigh_refL0 == curr_refL1) {
+-+                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-+                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else {
+-+                strength = 1;
+-+            }
+-+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+-+            Mv curr_mv0, neigh_mv0;
+++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+-+            if (curr->pred_flag & 1) {
+-+                curr_mv0   = curr->mv[0];
+-+            } else {
+-+                curr_mv0   = curr->mv[1];
+-+                curr_refL0 = curr_refL1;
+-+            }
+++  max r2, ra_y, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+ +
+-+            if (neigh->pred_flag & 1) {
+-+                neigh_mv0   = neigh->mv[0];
+-+            } else {
+-+                neigh_mv0   = neigh->mv[1];
+-+                neigh_refL0 = neigh_refL1;
+-+            }
+++  max r2, ra_y2, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+ +
+-+            if (curr_refL0 == neigh_refL0) {
+-+                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else
+-+                strength = 1;
+-+        } else
+-+            strength = 1;
+-+#else // This has exactly the same effect, but is more suitable for vectorisation
+-+        Mv curr_mv[2];
+-+        Mv neigh_mv[2];
+-+        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+-+        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+ +
+-+        if (!(curr->pred_flag & 2)) {
+-+            curr_mv[1] = curr_mv[0];
+-+            curr_refL1 = curr_refL0;
+-+        }
+-+        if (!(neigh->pred_flag & 2)) {
+-+            neigh_mv[1] = neigh_mv[0];
+-+            neigh_refL1 = neigh_refL0;
+-+        }
+-+        if (!(curr->pred_flag & 1)) {
+-+            curr_mv[0] = curr_mv[1];
+-+            curr_refL0 = curr_refL1;
+-+        }
+-+        if (!(neigh->pred_flag & 1)) {
+-+            neigh_mv[0] = neigh_mv[1];
+-+            neigh_refL0 = neigh_refL1;
+-+        }
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+        strength = 1;
+++# apply horizontal filter
+++  nop                  ; mul24      r3, ra0.8a,      r0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  sub r0, r2, r3       ; mov r3, rb31
+++
+++  sub.setf -, r3, 8       ; mov r1,   ra8
+++  mov ra8,  ra9           ; mov rb8,  rb9
+++  brr.anyn -, r:yloopb
+++  mov ra9,  ra10          ; mov rb9,  rb10
+++  mov ra10, ra11          ; mov rb10, rb11
+++  mov ra11, r0            ; mov rb11, r1
+++  # >>> .anyn yloopb
+ +
+-+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+-+                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+-+                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+++  # apply vertical filter and write to VPM
+ +
+-+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+-+                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+-+                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+++  nop                     ; mul24 r0, rb8,  ra2.8a
+++  nop                     ; mul24 r1, rb9,  ra2.8b
+++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++  add r1, r1, r0          ; mul24 r0, ra11, rb7
+++  sub r1, r1, r0          ; mov r2, rb12
+++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+++# Top 8 bits are bad - low 6 bits should be discarded
+++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+ +
+-+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+-+#endif
+++  asr r1, r1, 14
+++  nop                     ; mul24 r0, r1, rb14
+++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+ +
+-+        curr += in_inc / sizeof (MvField);
+-+        neigh += in_inc / sizeof (MvField);
+++  add r1, r1, r0          ; mov -, vw_wait
+++  shl r1, r1, 8
+ +
+-+        for (out = dup; out > 0; out--)
+-+        {
+-+            *bs = strength;
+-+            bs += out_inc;
+-+        }
+-+    }
+-+}
+++  brr.anyn -, r:yloopb
+++  asr r1, r1, rb13         # Delay 1
+++  min r1, r1, rb_k255       # Delay 2
+++  max vpm, r1, 0         # Delay 3
+ +
+- void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+- {
+- #undef FUNC
+-@@ -257,6 +371,8 @@ int i = 0;
+-         break;
+-     }
+- 
+-+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+++# DMA out
+++  brr -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, unif # start the VDW   Delay 3
+++
+++################################################################################
+++
+++# mc_interrupt_exit12()
+++::mc_interrupt_exit12
+++  mov  -, vw_wait # wait on the VDW
+++
+++  # Dummy wait to test instructions
+++#  mov r3,1000000
+++#:dummy_loop
+++#  sub.setf r3, r3, 1
+++#  nop
+++#  nop
+++#  brr.anynn -, r:dummy_loop
+++#  nop
+++#  nop
+++#  nop
+++
+++  ldtmu0
+++  ldtmu0
+++  ldtmu1
+++  ldtmu1
+++
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++  mov -,sacq(0) # 8
+++  mov -,sacq(0) # 9
+++  mov -,sacq(0) # 10
+++  mov -,sacq(0) # 11
+++
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+++
+++
+++::mc_exit1
+++  mov  -, vw_wait # wait on the VDW
+++
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  ldtmu1
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+++
+++
+++::mc_end
+++# Do not add code here because mc_end must appear after all other code.
++diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
++new file mode 100644
++index 0000000..db41a4d
++--- /dev/null
+++++ b/libavcodec/rpi_user_vcsm.h
++@@ -0,0 +1,459 @@
+++/*****************************************************************************
+++* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+++*
+++* This program is the proprietary software of Broadcom Corporation and/or
+++* its licensors, and may only be used, duplicated, modified or distributed
+++* pursuant to the terms and conditions of a separate, written license
+++* agreement executed between you and Broadcom (an "Authorized License").
+++* Except as set forth in an Authorized License, Broadcom grants no license
+++* (express or implied), right to use, or waiver of any kind with respect to
+++* the Software, and Broadcom expressly reserves all rights in and to the
+++* Software and all intellectual property rights therein.  IF YOU HAVE NO
+++* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+++* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+++* THE SOFTWARE.
+++*
+++* Except as expressly set forth in the Authorized License,
+++* 1. This program, including its structure, sequence and organization,
+++*    constitutes the valuable trade secrets of Broadcom, and you shall use
+++*    all reasonable efforts to protect the confidentiality thereof, and to
+++*    use this information only in connection with your use of Broadcom
+++*    integrated circuit products.
+++* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+++*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+++*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+++*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+++*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+++*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+++*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+++*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+++* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+++*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+++*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+++*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+++*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+++*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+++*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+++*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+++*****************************************************************************/
+ +
+-     if (ARCH_X86)
+-         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+-     if (ARCH_ARM)
+-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
+-index 9f1f6dd..e221e54 100644
+---- a/libavcodec/hevcdsp.h
+-+++ b/libavcodec/hevcdsp.h
+-@@ -42,6 +42,17 @@ typedef struct SAOParams {
+-     uint8_t type_idx[3];    ///< sao_type_idx
+- } SAOParams;
+- 
+-+typedef struct Mv {
+-+    int16_t x;  ///< horizontal component of motion vector
+-+    int16_t y;  ///< vertical component of motion vector
+-+} Mv;
+++#ifndef __USER_VCSM__H__INCLUDED__
+++#define __USER_VCSM__H__INCLUDED__
+ +
+-+typedef struct MvField {
+-+    DECLARE_ALIGNED(4, Mv, mv)[2];
+-+    int8_t ref_idx[2];
+-+    int8_t pred_flag;
+-+} MvField;
+++/* VideoCore Shared Memory - user interface library.
+++**
+++** This library provides all the necessary abstraction for any application to
+++** make use of the shared memory service which is distributed across a kernel
+++** driver and a videocore service.
+++**
+++** It is an application design decision to choose or not to use this service.
+++**
+++** The logical flow of operations that a user application needs to follow when
+++** using this service is:
+++**
+++**       1) Initialize the service.
+++**       2) Allocate shared memory blocks.
+++**       3) Start using the allocated blocks.
+++**          - In order to gain ownership on a block, lock the allocated block,
+++**            locking a block returns a valid address that the user application
+++**            can access.
+++**          - When finished with using the block for the current execution cycle
+++**            or function, and so when giving up the ownership, unlock the block.
+++**       4) A block can be locked/unlocked as many times required - within or outside
+++**          of - a specific execution context.
+++**       5) To completely release an allocated block, free it.
+++**       6) If the service is no longer required, terminate it.
+++**
+++**
+++** Some generic considerations:
+ +
+- typedef struct HEVCDSPContext {
+-     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+-                     struct GetBitContext *gb, int pcm_bit_depth);
+-@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
+-     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+-                                         int32_t *tc, uint8_t *no_p,
+-                                         uint8_t *no_q);
+-+    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                               MvField *curr, MvField *neigh, uint8_t *bs);
+- } HEVCDSPContext;
+- 
+- void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+--- 
+-2.7.4
+-
+-
+-From 619366d6acfd5f040a3116fda97b1146c8e40250 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 15 Jul 2015 09:09:11 +0100
+-Subject: [PATCH 68/68] Only enable qpu when needed
+-
+----
+- libavcodec/hevc.h    |  2 +-
+- libavcodec/rpi_qpu.c | 21 ++++++++++++++++-----
+- 2 files changed, 17 insertions(+), 6 deletions(-)
+-
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 496c0e1..ce14975 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -57,7 +57,7 @@
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+-   // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+--  #define RPI_DEBLOCK_VPU
+-+  //#define RPI_DEBLOCK_VPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 5aa0432..ffd13ca 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -9,7 +9,7 @@
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- // Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
+--#define RPI_COMBINE_JOBS
+-+//#define RPI_COMBINE_JOBS
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -143,9 +143,9 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   volatile struct GPU* ptr;
+- 	if (mb < 0)
+- 		return -1;
+--
+-+#ifndef RPI_ASYNC
+- 	if (qpu_enable(mb, 1)) return -2;
+--
+-+#endif
+-   vcsm_init();
+-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-@@ -336,9 +336,9 @@ static void gpu_term(void)
+-     vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
+-     pthread_join(vpu_thread, &res);
+-   }
+--#endif
+--
+-+#else
+-   qpu_enable(mb, 0);
+-+#endif
+-   gpu_free_internal(&gpu_mem_ptr);
+- 
+-   vcsm_exit();
+-@@ -400,6 +400,7 @@ static void *vpu_start(void *arg) {
+-   int count_deblock=0;
+-   int count_qpu=0;
+- #endif
+-+  int qpu_started = 0;
+-   while(1) {
+-     int i;
+-     int *p; // Pointer for a QPU/VPU job
+-@@ -427,6 +428,12 @@ static void *vpu_start(void *arg) {
+-     if (p[7] == 0 && p[0] == 0 && p[16]==0)
+-       goto job_done_early;
+- 
+-+    if (!qpu_started) {
+-+      int result = qpu_enable(gpu->mb, 1);
+-+      av_assert0(result==0);
+-+      qpu_started = 1;
+-+    }
+++** Allocating memory blocks.
+++**
+++**   Memory blocks can be allocated in different manners depending on the cache
+++**   behavior desired.  A given block can either be:
+ +
+- #ifdef RPI_COMBINE_JOBS
+-     // First scan for a qpu job
+-     for (int x=0;x<num_jobs;x++) {
+-@@ -556,6 +563,10 @@ job_done_early:
+-     pthread_mutex_unlock(&post_mutex);
+-   }
+- 
+-+  if (qpu_started) {
+-+    qpu_enable(gpu->mb, 0);
+-+  }
+++**       - Allocated in a non cached fashion all the way through host and videocore.
+++**       - Allocated in a cached fashion on host OR videocore.
+++**       - Allocated in a cached fashion on host AND videocore.
+++**
+++**   It is an application decision to determine how to allocate a block.  Evidently
+++**   if the application will be doing substantial read/write accesses to a given block,
+++**   it is recommended to allocate the block at least in a 'host cached' fashion for
+++**   better results.
+++**
+++**
+++** Locking memory blocks.
+++**
+++**   When the memory block has been allocated in a host cached fashion, locking the
+++**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+++**
+++**   For the above reason and when using host cached allocation, it is important that
+++**   an application properly implements the lock/unlock mechanism to ensure cache will
+++**   stay coherent, otherwise there is no guarantee it will at all be.
+++**
+++**   It is possible to dynamically change the host cache behavior (ie cached or non
+++**   cached) of a given allocation without needing to free and re-allocate the block.
+++**   This feature can be useful for such application which requires access to the block
+++**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+++**   the application can optimize performances for a given duration of use.
+++**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+++**   cache.  If one requires to change the videocore cache behavior, then a new block
+++**   must be created to replace the old one.
+++**
+++**   On successful locking, a valid pointer is returned that the application can use
+++**   to access to data inside the block.  There is no guarantee that the pointer will
+++**   stay valid following the unlock action corresponding to this lock.
+++**
+++**
+++** Unlocking memory blocks.
+++**
+++**   When the memory block has been allocated in a host cached fashion, unlocking the
+++**   memory block (and so giving up its ownership) will trigger a cache flush unless
+++**   explicitly asked not to flush the cache for performance reasons.
+++**
+++**   For the above reason and when using host cached allocation, it is important that
+++**   an application properly implements the lock/unlock mechanism to ensure cache will
+++**   stay coherent, otherwise there is no guarantee it will at all be.
+++**
+++**
+++** A complete API is defined below.
+++*/
+ +
+-   return NULL;
+- }
+- 
+--- 
+-2.7.4
+-
+-From a0d0946951b53e64ce103dd61b455f8d1f72caf9 Mon Sep 17 00:00:00 2001
+-From: John Cox <jc@kynesim.co.uk>
+-Date: Tue, 9 Feb 2016 11:57:40 +0000
+-Subject: [PATCH 1/2] Zero copy code v6
+-
+-This version has GPU buffer pooling code
+----
+- ffmpeg.c                 | 123 +++++++++-----
+- libavcodec/Makefile      |   2 +
+- libavcodec/avcodec.h     |   6 +
+- libavcodec/hevc.c        |  92 ++++++-----
+- libavcodec/hevc_filter.c |  83 +++++-----
+- libavcodec/rpi_qpu.c     |   2 +-
+- libavcodec/rpi_qpu.h     | 109 ++++++++++++-
+- libavcodec/rpi_zc.c      | 406 +++++++++++++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_zc.h      |  83 ++++++++++
+- 9 files changed, 779 insertions(+), 127 deletions(-)
+- create mode 100644 libavcodec/rpi_zc.c
+- create mode 100644 libavcodec/rpi_zc.h
+-
+-diff --git a/ffmpeg.c b/ffmpeg.c
+-index 50c6e86..953e5b8 100644
+---- a/ffmpeg.c
+-+++ b/ffmpeg.c
+-@@ -25,7 +25,7 @@
+- 
+- #ifdef RPI
+- #define RPI_DISPLAY
+--//#define RPI_ZERO_COPY
+-+#define RPI_ZERO_COPY
+- #endif
+- 
+- #include "config.h"
+-@@ -80,9 +80,7 @@
+- #include <interface/mmal/util/mmal_default_components.h>
+- #include <interface/mmal/util/mmal_connection.h>
+- #include <interface/mmal/util/mmal_util_params.h>
+--#ifdef RPI_ZERO_COPY
+--#include "libavcodec/rpi_qpu.h"
+--#endif
+-+#include "libavcodec/rpi_zc.h"
+- #endif
+- 
+- #if HAVE_SYS_RESOURCE_H
+-@@ -183,13 +181,7 @@ static void free_input_threads(void);
+- 
+- static MMAL_COMPONENT_T* rpi_display = NULL;
+- static MMAL_POOL_T *rpi_pool = NULL;
+--
+--#ifdef RPI_ZERO_COPY
+--static uint8_t *get_vc_handle(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return (uint8_t *)p->vc_handle;
+--}
+--#endif
+-+static volatile int rpi_display_count = 0;
+- 
+- static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+- {
+-@@ -206,7 +198,7 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+-     for (i = 0; i < NUM_BUFFERS; ++i)
+-     {
+-        MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+--       void* bufPtr = buffer->data;
+-+       char * bufPtr = buffer->data;
+-        memset(bufPtr, i*30, w*h);
+-        memset(bufPtr+w*h, 128, (w*h)/2);
+-     }
+-@@ -215,23 +207,31 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+-     return pool;
+- }
+- 
+--static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+-+#ifdef RPI_ZERO_COPY
+-+    av_rpi_zc_unref(buffer->user_data);
+-+    --rpi_display_count;
+++#ifdef __cplusplus
+++extern "C"
+++{
+ +#endif
+-+    mmal_buffer_header_release(buffer);
+-+}
+-+
+-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+-   mmal_buffer_header_release(buffer);
+- }
+- 
+- static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- {
+-     MMAL_COMPONENT_T* display;
+--    int w2 = (w+31)&~31;
+--    int h2 = (h+15)&~15;
+-     MMAL_DISPLAYREGION_T region =
+-     {
+--        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+-+        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+-         .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+-         .layer = 2,
+-         .fullscreen = 0,
+-         .dest_rect = {x, y, w, h}
+-     };
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+ +
+-     bcm_host_init();  // TODO is this needed?
+-     mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+-     assert(display);
+-@@ -240,8 +240,8 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- 
+-     MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-     format->encoding = MMAL_ENCODING_I420;
+--    format->es->video.width = w2;
+--    format->es->video.height = h2;
+-+    format->es->video.width = geo.stride_y;
+-+    format->es->video.height = geo.height_y;
+-     format->es->video.crop.x = 0;
+-     format->es->video.crop.y = 0;
+-     format->es->video.crop.width = w;
+-@@ -250,46 +250,75 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- 
+-     mmal_component_enable(display);
+- 
+--    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
+-+    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+- 
+-     mmal_port_enable(display->input[0],display_cb_input);
+--    mmal_port_enable(display->control,display_cb_input);
+-+    mmal_port_enable(display->control,display_cb_control);
+- 
+--    printf("Allocated display %d %d\n",w,h);
+-+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+- 
+-     return display;
+- }
+- 
+--static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
+-+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+- {
+--    int w = fr->width;
+--    int h = fr->height;
+--    int w2 = (w+31)&~31;
+--    int h2 = (h+15)&~15;
+-     if (!display || !rpi_pool)
+-         return;
+++/* Different status that can be dumped.
+++*/
+++typedef enum
+++{
+++   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+++                                    // Result of the walk is seen in the videocore
+++                                    // log.
+++   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+++                                    // driver (ie for all processes).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+++                                    // driver (for current process).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+++                                    // driver (for current process).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+++                                    // VCSM_STATUS_HOST_WALK_MAP.
+++                                    //
+++   VCSM_STATUS_NONE,                // Must be last - invalid.
+ +
+-+    if (rpi_display_count >= 3) {
+-+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+-+        return;
+-+    }
+++} VCSM_STATUS_T;
+ +
+-     MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
+-     if (!buf) {
+--      // Running too fast so drop the frame
+--      return;
+-+        // Running too fast so drop the frame
+-+        printf("Q alloc failure\n");
+-+        return;
+-     }
+-     assert(buf);
+-     buf->cmd = 0;
+--    buf->length = (w2 * h2 * 3)/2;
+-     buf->offset = 0; // Offset to valid data
+-     buf->flags = 0;
+- #ifdef RPI_ZERO_COPY
+--    buf->data = get_vc_handle(fr->buf[0]);
+--    buf->alloc_size = (w2*h2*3)/2;
+++/* Different kind of cache behavior.
+++*/
+++typedef enum
+ +{
+-+    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+++   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+++   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+++   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+ +
+-+    buf->user_data = fr_buf;
+-+    buf->data = av_rpi_zc_vc_handle(fr_buf);
+-+    buf->alloc_size =
+-+        buf->length = av_rpi_zc_numbytes(fr_buf);
+++} VCSM_CACHE_TYPE_T;
+ +
+-+    ++rpi_display_count;
+-+}
+- #else
+-+{
+-+#error YYY
+-+    int w = fr->width;
+-+    int h = fr->height;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+++/* Initialize the vcsm processing.
+++**
+++** Must be called once before attempting to do anything else.
+++**
+++** Returns 0 on success, -1 on error.
+++*/
+++int vcsm_init( void );
+ +
+-+    buf->length = (w2 * h2 * 3)/2;
+-+    buf->user_data = NULL;
+ +
+-     //mmal_buffer_header_mem_lock(buf);
+-     memcpy(buf->data, fr->data[0], w2 * h);
+-     memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+-     memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+-     //mmal_buffer_header_mem_unlock(buf);
+-+}
+- #endif
+- 
+--    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
+-+    while (rpi_display_count >= 3) {
+-+        usleep(5000);
+-+    }
+++/* Terminates the vcsm processing.
+++**
+++** Must be called when vcsm services are no longer needed; it will
+++** take care of removing any allocation under the current process
+++** control if deemed necessary.
+++*/
+++void vcsm_exit( void );
+ +
+-+    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+-+    {
+-+        printf("** send failed: depth=%d\n", rpi_display_count);
+-+        display_cb_input(NULL, buf);
+-+    }
+- }
+- 
+- static void display_exit(MMAL_COMPONENT_T* display)
+-@@ -687,6 +716,11 @@ static void ffmpeg_cleanup(int ret)
+-         avformat_close_input(&input_files[i]->ctx);
+-         av_freep(&input_files[i]);
+-     }
+ +
+-+#ifdef RPI_DISPLAY
+-+    display_exit(rpi_display);
+-+#endif
+++/* Queries the status of the vcsm.
+++**
+++** Triggers dump of various kind of information, see the
+++** different variants specified in VCSM_STATUS_T.
+++**
+++** Pid is optional.
+++*/
+++void vcsm_status( VCSM_STATUS_T status, int pid );
+ +
+-     for (i = 0; i < nb_input_streams; i++) {
+-         InputStream *ist = input_streams[i];
+- 
+-@@ -698,6 +732,9 @@ static void ffmpeg_cleanup(int ret)
+-         av_freep(&ist->filters);
+-         av_freep(&ist->hwaccel_device);
+- 
+-+#ifdef RPI_ZERO_COPY
+-+        av_rpi_zc_uninit(ist->dec_ctx);
+-+#endif
+-         avcodec_free_context(&ist->dec_ctx);
+- 
+-         av_freep(&input_streams[i]);
+-@@ -729,9 +766,6 @@ static void ffmpeg_cleanup(int ret)
+-     term_exit();
+-     ffmpeg_exited = 1;
+- 
+--#ifdef RPI_DISPLAY
+--    display_exit(rpi_display);
+--#endif
+- }
+- 
+- void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -1091,18 +1125,19 @@ static void do_video_out(AVFormatContext *s,
+-     int frame_size = 0;
+-     InputStream *ist = NULL;
+-     AVFilterContext *filter = ost->filter->filter;
+ +
+-+    if (ost->source_index >= 0)
+-+        ist = input_streams[ost->source_index];
+++/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+++** allocator.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc( unsigned int size, char *name );
+ +
+- #ifdef RPI_DISPLAY
+--    if (next_picture)
+-+    if (next_picture && ist != NULL)
+-     {
+--	if (!rpi_display)
+-+        if (!rpi_display)
+-            rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+--        display_frame(rpi_display,next_picture);
+-+        display_frame(ist->dec_ctx, rpi_display, next_picture);
+-     }
+- #endif
+- 
+--    if (ost->source_index >= 0)
+--        ist = input_streams[ost->source_index];
+--
+-     if (filter->inputs[0]->frame_rate.num > 0 &&
+-         filter->inputs[0]->frame_rate.den > 0)
+-         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
+-@@ -2708,6 +2743,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+-         ist->dec_ctx->opaque                = ist;
+-         ist->dec_ctx->get_format            = get_format;
+-         ist->dec_ctx->get_buffer2           = get_buffer;
+ +
+-+#ifdef RPI_ZERO_COPY
+-+        // Overrides the above get_buffer2
+-+        av_rpi_zc_init(ist->dec_ctx);
+-+#endif
+++/* Allocates a cached block of memory of size 'size' via the vcsm memory
+++** allocator, the type of caching requested is passed as argument of the
+++** function call.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+ +
+-         ist->dec_ctx->thread_safe_callbacks = 1;
+- 
+-         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
+-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index 03065cd..21e4514 100644
+---- a/libavcodec/Makefile
+-+++ b/libavcodec/Makefile
+-@@ -9,6 +9,7 @@ HEADERS = avcodec.h                                                     \
+-           rpi_shader.h                                                  \
+-           rpi_mailbox.h                                                 \
+-           rpi_hevc_transform.h                                          \
+-+          rpi_zc.h                                                      \
+-           d3d11va.h                                                     \
+-           dirac.h                                                       \
+-           dv_profile.h                                                  \
+-@@ -50,6 +51,7 @@ OBJS = allcodecs.o                                                      \
+-        rpi_qpu.o                                                        \
+-        rpi_shader.o                                                     \
+-        rpi_mailbox.o                                                    \
+-+       rpi_zc.o                                                         \
+-        vorbis_parser.o                                                  \
+-        xiph.o                                                           \
+- 
+-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+-index 39713ed..a1ba217 100644
+---- a/libavcodec/avcodec.h
+-+++ b/libavcodec/avcodec.h
+-@@ -3505,6 +3505,12 @@ typedef struct AVCodecContext {
+- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+- #endif
+- 
+-+    /**
+-+     * Opaque pointer for use by replacement get_buffer2 code
+-+     *
+-+     * @author jc (08/02/2016)
+-+     */
+-+    void * get_buffer_context;
+- } AVCodecContext;
+- 
+- AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 8437e10..51736c7 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -114,10 +114,6 @@ static uint32_t rpi_filter_coefs[8][1] = {
+-         { ENCODE_COEFFS(  -2,  10,  58,  -2) }
+- };
+- 
+--static uint32_t get_vc_address(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc;
+--}
+- #endif
+- 
+- 
+-@@ -2197,9 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-@@ -2207,7 +2203,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   } else {
+-                       *y++ = 1; // Weight of 1 and offset of 0
+-                   }
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+-@@ -2246,8 +2242,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2258,8 +2254,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-                       }
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -2297,9 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-@@ -2307,7 +2303,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   } else {
+-                       *y++ = 1; // Weight of 1 and offset of 0
+-                   }
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+-@@ -2347,8 +2343,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-@@ -2360,8 +2356,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-                       }
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -2403,13 +2399,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   *y++ = 1; // B frame weighted prediction not supported
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+-@@ -2453,8 +2449,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2464,14 +2460,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-                       u+=2; // Weights not supported in B slices
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -3270,12 +3266,13 @@ static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,
+-    return vsum;
+- }
+- 
+--static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+- {
+-   //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-   int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-   int pitch = frame->linesize[cIdx];
+--  uint32_t base = get_vc_address(frame->buf[cIdx]);
+-+  uint32_t base = c_idx == 0 ? get_vc_address_y(frame);
+-+    c_idx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+-   if (p>=base && p<base+pitch*pic_height) {
+-     return frame->data[cIdx] + (p-base);
+-   }
+-@@ -3562,6 +3559,7 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- #ifdef RPI
+- 
+- #ifndef RPI_FAST_CACHEFLUSH
+-+#error RPI_FAST_CACHEFLUSH is broken
+- static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+-@@ -3572,7 +3570,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-     int curr_uv = 0;
+-@@ -3580,21 +3578,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+-     int sz,base;
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = (int)(p->arm) + base;
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-     iocache.s[0].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[2]);
+--    iocache.s[1].handle = p->vcsm_handle;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = (int)(p->arm) + base;
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-     iocache.s[1].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+--    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = (int)(p->arm) + base;
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-     iocache.s[2].size  = sz;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -3612,7 +3610,7 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
+-     int curr_y;
+-     int curr_uv;
+-     int n_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-     int sz,base;
+-     int (*d)[2] = s->dblk_cmds[job];
+-     int low=(*d)[1];
+-@@ -3629,21 +3627,21 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
+- 
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = (int)(p->arm) + base;
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-     iocache.s[0].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[2]);
+--    iocache.s[1].handle = p->vcsm_handle;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = (int)(p->arm) + base;
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-     iocache.s[1].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+--    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = (int)(p->arm) + base;
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-     iocache.s[2].size  = sz;
+- 
+-     iocache.s[3].handle = p0->vcsm_handle;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 826a82f..c4fa305 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -879,17 +879,25 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+- #undef CR
+- 
+- #ifdef RPI_INTER_QPU
+--static void flush_buffer(AVBufferRef *bref) {
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--    gpu_cache_flush(p);
+-+static void flush_buffer_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+-+    gpu_cache_flush(&p);
+- }
+- 
+--// Return Physical address for this image
+--static uint32_t get_vc_address(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc;
+-+static void flush_buffer_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+-+    gpu_cache_flush(&p);
+- }
+- 
+-+static void flush_buffer_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+-+    gpu_cache_flush(&p);
+-+}
+++
+++/* Shares an allocated block of memory via the vcsm memory allocator.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc_share( unsigned int handle );
+ +
+ +
+-+#ifdef RPI_DEBLOCK_VPU
+-+#error Not fixed yet
+++/* Resizes a block of memory allocated previously by vcsm_malloc.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** The handle must be unlocked by user prior to attempting any
+++** resize action.
+++**
+++** On error, the original size allocated against the handle
+++** remains available the same way it would be following a
+++** successful vcsm_malloc.
+++*/
+++int vcsm_resize( unsigned int handle, unsigned int new_size );
+ +
+- // ff_hevc_flush_buffer_lines
+- // flushes and invalidates all pixel rows in [start,end-1]
+- static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+-@@ -901,44 +909,44 @@ static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int f
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        GPU_MEM_PTR_T *p;
+-+        GPU_MEM_PTR_T p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+-         if (flush_chroma) {
+--          p = av_buffer_pool_opaque(s->frame->buf[1]);
+--          iocache.s[0].handle = p->vcsm_handle;
+-+          p = get_gpu_mem_ptr_u(s->frame);
+-+          iocache.s[0].handle = p.vcsm_handle;
+-           iocache.s[0].cmd = 3; // clean+invalidate
+--          iocache.s[0].addr = (int)p->arm + base;
+-+          iocache.s[0].addr = (int)p.arm + base;
+-           iocache.s[0].size  = sz;
+--          p = av_buffer_pool_opaque(s->frame->buf[2]);
+--          iocache.s[1].handle = p->vcsm_handle;
+-+          p = get_gpu_mem_ptr_v(s->frame);
+-+          iocache.s[1].handle = p.vcsm_handle;
+-           iocache.s[1].cmd = 3; // clean+invalidate
+--          iocache.s[1].addr = (int)p->arm + base;
+-+          iocache.s[1].addr = (int)p.arm + base;
+-           iocache.s[1].size  = sz;
+-         }
+-         if (flush_luma) {
+--          p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+          p = get_gpu_mem_ptr_y(s->frame);
+-           sz = s->frame->linesize[0] * (n-curr_y);
+-           base = s->frame->linesize[0] * curr_y;
+--          iocache.s[2].handle = p->vcsm_handle;
+-+          iocache.s[2].handle = p.vcsm_handle;
+-           iocache.s[2].cmd = 3; // clean+invalidate
+--          iocache.s[2].addr = (int)p->arm + base;
+-+          iocache.s[2].addr = (int)p.arm + base;
+-           iocache.s[2].size  = sz;
+-         }
+-         vcsm_clean_invalid( &iocache );
+- #else
+-         if (flush_chroma) {
+--          flush_buffer(s->frame->buf[1]);
+--          flush_buffer(s->frame->buf[2]);
+-+          flush_buffer_u(s->frame);
+-+          flush_buffer_v(s->frame);
+-         }
+-         if (flush_luma) {
+--          flush_buffer(s->frame->buf[0]);
+-+          flush_buffer_y(s->frame);
+-         }
+- #endif
+- }
+--
+-+#endif
+- 
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-@@ -950,37 +958,37 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        GPU_MEM_PTR_T *p;
+-+        GPU_MEM_PTR_T p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+--        p = av_buffer_pool_opaque(s->frame->buf[1]);
+--        iocache.s[0].handle = p->vcsm_handle;
+-+        p = get_gpu_mem_ptr_u(s->frame);
+-+        iocache.s[0].handle = p.vcsm_handle;
+-         iocache.s[0].cmd = 3; // clean+invalidate
+--        iocache.s[0].addr = (int)p->arm + base;
+-+        iocache.s[0].addr = (int)p.arm + base;
+-         iocache.s[0].size  = sz;
+--        p = av_buffer_pool_opaque(s->frame->buf[2]);
+--        iocache.s[1].handle = p->vcsm_handle;
+-+        p = get_gpu_mem_ptr_v(s->frame);
+-+        iocache.s[1].handle = p.vcsm_handle;
+-         iocache.s[1].cmd = 3; // clean+invalidate
+--        iocache.s[1].addr = (int)p->arm + base;
+-+        iocache.s[1].addr = (int)p.arm + base;
+-         iocache.s[1].size  = sz;
+- 
+- #ifdef RPI_LUMA_QPU
+--        p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+        p = get_gpu_mem_ptr_y(s->frame);
+-         sz = s->frame->linesize[0] * (n-curr_y);
+-         base = s->frame->linesize[0] * curr_y;
+--        iocache.s[2].handle = p->vcsm_handle;
+-+        iocache.s[2].handle = p.vcsm_handle;
+-         iocache.s[2].cmd = 3; // clean+invalidate
+--        iocache.s[2].addr = (int)p->arm + base;
+-+        iocache.s[2].addr = (int)p.arm + base;
+-         iocache.s[2].size  = sz;
+- #endif
+-         vcsm_clean_invalid( &iocache );
+- #else
+--        flush_buffer(s->frame->buf[1]);
+--        flush_buffer(s->frame->buf[2]);
+-+        flush_buffer_u(s->frame);
+-+        flush_buffer_v(s->frame);
+- #ifdef RPI_LUMA_QPU
+--        flush_buffer(s->frame->buf[0]);
+-+        flush_buffer_y(s->frame);
+- #endif
+- 
+- #endif
+-@@ -992,6 +1000,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- #endif
+- 
+- #ifdef RPI_DEBLOCK_VPU
+-+#error XXX
+- /* rpi_deblock deblocks an entire row of ctbs using the VPU */
+- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+- {
+-@@ -1000,21 +1009,21 @@ static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+-   // TODO flush buffer of beta/tc setup when it becomes cached
+- 
+-   // Prepare three commands at once to avoid calling overhead
+--  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
+-+  s->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+-   s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+-   s->vpu_cmds_arm[0][2] = s->setup_width;
+-   s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
+-   s->vpu_cmds_arm[0][4] = ctb_size>>4;
+-   s->vpu_cmds_arm[0][5] = 2;
+- 
+--  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-+  s->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-   s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+-   s->vpu_cmds_arm[1][2] = s->uv_setup_width;
+-   s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-   s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-   s->vpu_cmds_arm[1][5] = 3;
+- 
+--  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-+  s->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-   s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+-   s->vpu_cmds_arm[2][2] = s->uv_setup_width;
+-   s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index ffd13ca..b0c9bc5 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -250,7 +250,7 @@ int gpu_get_mailbox(void)
+- }
+- 
+- // Call this to clean and invalidate a region of memory
+--void gpu_cache_flush(GPU_MEM_PTR_T *p)
+-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 81c2bb1..b913f79 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -2,8 +2,11 @@
+- #define RPI_QPU_H
+- 
+- // Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+- #define RPI_FAST_CACHEFLUSH
+- 
+-+#define RPI_ONE_BUF 1
+ +
+- typedef struct gpu_mem_ptr_s {
+-   unsigned char *arm; // Pointer to memory mapped on ARM side
+-   int vc_handle;   // Videocore handle of relocatable memory
+-@@ -16,9 +19,113 @@ typedef struct gpu_mem_ptr_s {
+- extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+- extern void gpu_free(GPU_MEM_PTR_T *p);
+--extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+- extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+-+#include "libavutil/frame.h"
+-+#if !RPI_ONE_BUF
+-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+-+    return p->vc;
+-+}
+++/* Frees a block of memory that was successfully allocated by
+++** a prior call to vcsm_malloc.
+++**
+++** The handle should be considered invalid upon return from this
+++** call.
+++**
+++** Whether any memory is actually freed up or not as the result of
+++** this call will depend on many factors, if all goes well it will
+++** be freed.  If something goes wrong, the memory will likely end up
+++** being freed up as part of the vcsm_exit process.  In the end the
+++** memory is guaranteed to be freed one way or another.
+++*/
+++void vcsm_free( unsigned int handle );
+ +
+-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    return p->vc;
+-+}
+ +
+-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+-+    return p->vc;
+-+}
+++/* Retrieves a videocore opaque handle from a mapped user address
+++** pointer.  The videocore handle will correspond to the actual
+++** memory mapped in videocore.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** Note: the videocore opaque handle is distinct from the user
+++**       opaque handle (allocated via vcsm_malloc) and it is only
+++**       significant for such application which knows what to do
+++**       with it, for the others it is just a number with little
+++**       use since nothing can be done with it (in particular
+++**       for safety reason it cannot be used to map anything).
+++*/
+++unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+-+}
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+-+}
+++/* Retrieves a videocore opaque handle from a opaque handle
+++** pointer.  The videocore handle will correspond to the actual
+++** memory mapped in videocore.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** Note: the videocore opaque handle is distinct from the user
+++**       opaque handle (allocated via vcsm_malloc) and it is only
+++**       significant for such application which knows what to do
+++**       with it, for the others it is just a number with little
+++**       use since nothing can be done with it (in particular
+++**       for safety reason it cannot be used to map anything).
+++*/
+++unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+-+}
+ +
+-+#else
+++/* Retrieves a user opaque handle from a mapped user address
+++** pointer.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++*/
+++unsigned int vcsm_usr_handle( void *usr_ptr );
+ +
+-+static inline int gpu_is_buf1(const AVFrame * const frame)
+-+{
+-+    return frame->buf[1] == NULL;
+-+}
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+-+{
+-+    return av_buffer_get_opaque(frame->buf[0]);
+-+}
+++/* Retrieves a mapped user address from an opaque user
+++** handle.
+++**
+++** Returns:        0 on error
+++**                 a non-zero address on success.
+++**
+++** On success, the address corresponds to the pointer
+++** which can access the data allocated via the vcsm_malloc
+++** call.
+++*/
+++void *vcsm_usr_address( unsigned int handle );
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+-+{
+-+    return av_buffer_pool_opaque(frame->buf[n]);
+-+}
+++
+++/* Locks the memory associated with this opaque handle.
+++**
+++** Returns:        NULL on error
+++**                 a valid pointer on success.
+++**
+++** A user MUST lock the handle received from vcsm_malloc
+++** in order to be able to use the memory associated with it.
+++**
+++** On success, the pointer returned is only valid within
+++** the lock content (ie until a corresponding vcsm_unlock_xx
+++** is invoked).
+++*/
+++void *vcsm_lock( unsigned int handle );
+ +
+ +
+-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+-+}
+++/* Locks the memory associated with this opaque handle.  The lock
+++** also gives a chance to update the *host* cache behavior of the
+++** allocated buffer if so desired.  The *videocore* cache behavior
+++** of the allocated buffer cannot be changed by this call and such
+++** attempt will be ignored.
+++**
+++** The system will attempt to honour the cache_update mode request,
+++** the cache_result mode will provide the final answer on which cache
+++** mode is really in use.  Failing to change the cache mode will not
+++** result in a failure to lock the buffer as it is an application
+++** decision to choose what to do if (cache_result != cache_update)
+++**
+++** The value returned in cache_result can only be considered valid if
+++** the returned pointer is non NULL.  The cache_result pointer may be
+++** NULL if the application does not care about the actual outcome of
+++** its action with regards to the cache behavior change.
+++**
+++** Returns:        NULL on error
+++**                 a valid pointer on success.
+++**
+++** A user MUST lock the handle received from vcsm_malloc
+++** in order to be able to use the memory associated with it.
+++**
+++** On success, the pointer returned is only valid within
+++** the lock content (ie until a corresponding vcsm_unlock_xx
+++** is invoked).
+++*/
+++void *vcsm_lock_cache( unsigned int handle,
+++                       VCSM_CACHE_TYPE_T cache_update,
+++                       VCSM_CACHE_TYPE_T *cache_result );
+++
+++
+++/* Unlocks the memory associated with this user mapped address.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking a mapped address, the user should no longer
+++** attempt to reference it.
+++*/
+++int vcsm_unlock_ptr( void *usr_ptr );
+++
+++
+++/* Unlocks the memory associated with this user mapped address.
+++** Apply special processing that would override the otherwise
+++** default behavior.
+++**
+++** If 'cache_no_flush' is specified:
+++**    Do not flush cache as the result of the unlock (if cache
+++**    flush was otherwise applicable in this case).
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking a mapped address, the user should no longer
+++** attempt to reference it.
+++*/
+++int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+++
+ +
+-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 1)->vc;
+-+}
+++/* Unlocks the memory associated with this user opaque handle.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking an opaque handle, the user should no longer
+++** attempt to reference the mapped addressed once associated
+++** with it.
+++*/
+++int vcsm_unlock_hdl( unsigned int handle );
+ +
+-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 2)->vc;
+-+}
+ +
+++/* Unlocks the memory associated with this user opaque handle.
+++** Apply special processing that would override the otherwise
+++** default behavior.
+++**
+++** If 'cache_no_flush' is specified:
+++**    Do not flush cache as the result of the unlock (if cache
+++**    flush was otherwise applicable in this case).
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking an opaque handle, the user should no longer
+++** attempt to reference the mapped addressed once associated
+++** with it.
+++*/
+++int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.numbytes = frame->data[1] - frame->data[0];
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 0);
+-+}
+++/* Clean and/or invalidate the memory associated with this user opaque handle
+++**
+++** Returns:        non-zero on error
+++**
+++** structure contains a list of flush/invalidate commands. Commands are:
+++** 0: nop
+++** 1: invalidate       given virtual range in L1/L2
+++** 2: clean            given virtual range in L1/L2
+++** 3: clean+invalidate given virtual range in L1/L2
+++** 4: flush all L1/L2
+++*/
+++struct vcsm_user_clean_invalid_s {
+++   struct {
+++      unsigned int cmd;
+++      unsigned int handle;
+++      unsigned int addr;
+++      unsigned int size;
+++   } s[8];
+++};
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.arm += frame->data[1] - frame->data[0];
+-+        g.vc += frame->data[1] - frame->data[0];
+-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 1);
+-+}
+++int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.arm += frame->data[2] - frame->data[0];
+-+        g.vc += frame->data[2] - frame->data[0];
+-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 2);
+++#ifdef __cplusplus
+ +}
+-+
+ +#endif
+ +
+-+
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+++#endif /* __USER_VCSM__H__INCLUDED__ */
+ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+ new file mode 100644
+ index 0000000..9580165
+@@ -38057,80 +15466,3089 @@ index 0000000..f0109f4
+ +
+ +#endif
+ +
+--- 
+-2.7.4
+-
+-
+-From a6da64e1ca42f0394ccfa55dca782a456841da94 Mon Sep 17 00:00:00 2001
+-From: John Cox <jc@kynesim.co.uk>
+-Date: Tue, 1 Mar 2016 14:21:25 +0000
+-Subject: [PATCH 2/2] Set VPU scheduling thread to high priority after creation
+-
+----
+- libavcodec/rpi_qpu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
+- 1 file changed, 47 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index b0c9bc5..ee19231 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -182,9 +182,55 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-     //printf("Created thread\n");
+-     if (err) {
+--        printf("Failed to create vpu thread\n");
+-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+-         return -4;
+-     }
++diff --git a/libavcodec/utils.c b/libavcodec/utils.c
++index f7adb52..708526e 100644
++--- a/libavcodec/utils.c
+++++ b/libavcodec/utils.c
++@@ -26,6 +26,12 @@
++  */
++ 
++ #include "config.h"
+ +
+-+    {
+-+      struct sched_param param = {0};
+-+      int policy = 0;
+++#ifdef RPI
+++// Move video buffers to GPU memory
+++#define RPI_GPU_BUFFERS
+++#endif
+ +
+-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+      {
+-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+      }
+-+      else
+-+      {
+-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
++ #include "libavutil/atomic.h"
++ #include "libavutil/attributes.h"
++ #include "libavutil/avassert.h"
++@@ -64,6 +70,10 @@
++ #include "libavutil/ffversion.h"
++ const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
++ 
+++#ifdef RPI_GPU_BUFFERS
+++#include "rpi_qpu.h"
+++#endif
+ +
+-+        policy = SCHED_FIFO;
+-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
++ #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
++ static int default_lockmgr_cb(void **arg, enum AVLockOp op)
++ {
++@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
++     return ret;
++ }
++ 
+++#ifdef RPI_GPU_BUFFERS
+++static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+++{
+++    GPU_MEM_PTR_T *p = opaque;
+++    gpu_free(p);
+++    av_free(p);
+++}
+ +
+-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+++static AVBufferRef *rpi_buffer_alloc(int size)
+++{
+++    AVBufferRef *ret = NULL;
+++    uint8_t    *data = NULL;
+++    GPU_MEM_PTR_T *p;
+ +
+-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+-+        {
+-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+-+        }
+-+        else
+-+        {
+-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+          {
+-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+          }
+-+          else
+-+          {
+-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+-+                policy,
+-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+                param.sched_priority);
+-+          }
+-+        }
+-+      }
+++    static int total=0;
+++    total+=size;
+++
+++    p = av_malloc(sizeof *p);
+++    if (!p)
+++        return NULL;
+++
+++    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
+++        return NULL;
+++
+++    data = p->arm;
+++    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+++    //memset(data, 64, size);
+++
+++    if (!data)
+++        return NULL;
+ +
+++    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+++    if (!ret) {
+++        gpu_free(p);
+++        av_freep(&p);
+ +    }
+ +
+-   }
+++    return ret;
+++}
+++#endif
+++
++ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
++ {
++     FramePool *pool = avctx->internal->pool;
++@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
++             av_buffer_pool_uninit(&pool->pools[i]);
++             pool->linesize[i] = linesize[i];
++             if (size[i]) {
+++#ifdef RPI_GPU_BUFFERS
+++                if (avctx->codec_id == AV_CODEC_ID_HEVC)
+++                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+++                                                     CONFIG_MEMORY_POISONING ?
+++                                                        NULL :
+++                                                        rpi_buffer_alloc);
+++                else
+++#endif
++                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
++                                                      CONFIG_MEMORY_POISONING ?
++                                                         NULL :
++diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
++index b31d233..2767306 100644
++--- a/libavformat/mpegts.c
+++++ b/libavformat/mpegts.c
++@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
+  #endif
++     { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
++     { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC        },
++-    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
+++    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC   },
++     { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
++     { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
++     { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
++diff --git a/libavformat/utils.c b/libavformat/utils.c
++index 6f343f2..83f26d5 100644
++--- a/libavformat/utils.c
+++++ b/libavformat/utils.c
++@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
++         int default_stream_index = av_find_default_stream_index(s);
++         if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
++             for (i = 0; i < s->nb_streams; i++) {
++-                if (av_find_program_from_stream(s, NULL, i))
+++                if (0 && av_find_program_from_stream(s, NULL, i))
++                     continue;
++                 s->streams[i]->pts_wrap_reference = pts_wrap_reference;
++                 s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
++diff --git a/libavutil/buffer.c b/libavutil/buffer.c
++index 694e116..203ca7b 100644
++--- a/libavutil/buffer.c
+++++ b/libavutil/buffer.c
++@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
++ 
++     return ret;
++ }
+++
+++// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+++void *av_buffer_pool_opaque(AVBufferRef *ref) {
+++  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+++  return buf->opaque;
+++}
++diff --git a/libavutil/buffer.h b/libavutil/buffer.h
++index 0c0ce12..82e0bc3 100644
++--- a/libavutil/buffer.h
+++++ b/libavutil/buffer.h
++@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
++  */
++ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+  
+--- 
+-2.7.4
+-
+++// Return the opaque for the underlying frame
+++void *av_buffer_pool_opaque(AVBufferRef *ref);
+++
++ /**
++  * @}
++  */
++diff --git a/pi-util/conf.sh b/pi-util/conf.sh
++new file mode 100755
++index 0000000..8b596a2
++--- /dev/null
+++++ b/pi-util/conf.sh
++@@ -0,0 +1,33 @@
+++echo "Configure for Pi2/3"
+++
+++RPI_BUILDROOT=`pwd`/build
+++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+++#RPI_KEEPS="-save-temps=obj"
+++RPI_KEEPS=""
+++
+++./configure --enable-cross-compile\
+++ --arch=armv6t2\
+++ --cpu=cortex-a7\
+++ --target-os=linux\
+++ --disable-stripping\
+++ --disable-thumb\
+++ --enable-mmal\
+++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+++
+++# --enable-extra-warnings\
+++# --arch=armv71\
+++# --enable-shared\
+++
+++# gcc option for getting asm listing
+++# -Wa,-ahls
++diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
++new file mode 100644
++index 0000000..61d1399
++--- /dev/null
+++++ b/pi-util/conf_h265.csv
++@@ -0,0 +1,144 @@
+++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+++2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+++2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+++2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+++2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+++2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+++2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+++2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+++2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+++2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
++diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
++new file mode 100644
++index 0000000..38f942f
++--- /dev/null
+++++ b/pi-util/ffconf.py
++@@ -0,0 +1,146 @@
+++#!/usr/bin/env python
+++
+++import os
+++import subprocess
+++import re
+++import argparse
+++import sys
+++import csv
+++from stat import *
+++
+++conf_root = "/opt/conform/h265"
+++ffmpeg_exec = "./ffmpeg"
+++
+++def testone(fileroot, name, es_file, md5_file):
+++    tmp_root = "/tmp"
+++
+++    dec_file = os.path.join(tmp_root, name + ".dec.md5")
+++    try:
+++        os.remove(dec_file)
+++    except:
+++        pass
+++
+++    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+++
+++    # Unaligned needed for cropping conformance
+++    rstr = subprocess.call(
+++        [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+++        stdout=flog, stderr=subprocess.STDOUT)
+++
+++    try:
+++        m1 = None
+++        m2 = None
+++        with open(os.path.join(fileroot, md5_file)) as f:
+++            for line in f:
+++                m1 = re.search("[0-9a-f]{32}", line.lower())
+++                if m1:
+++                    break
+++
+++        with open(dec_file) as f:
+++            m2 = re.search("[0-9a-f]{32}", f.readline())
+++    except:
+++        pass
+++
+++    rv = False
+++    if  m1 and m2 and m1.group() == m2.group():
+++        print >> flog, "Match: " + m1.group()
+++        rv = True
+++    elif not m1:
+++        print >> flog, "****** Cannot find m1"
+++    elif not m2:
+++        print >> flog, "****** Cannot find m2"
+++    else:
+++        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+++    flog.close()
+++    return rv
+++
+++def scandir(root):
+++    aconf = []
+++    ents = os.listdir(conf_root)
+++    ents.sort(key=str.lower)
+++    for name in ents:
+++        test_path = os.path.join(conf_root, name)
+++        if S_ISDIR(os.stat(test_path).st_mode):
+++            files = os.listdir(test_path)
+++            es_file = "?"
+++            md5_file = "?"
+++            for f in files:
+++                (base, ext) = os.path.splitext(f)
+++                if base[0] == '.':
+++                    pass
+++                elif ext == ".bit" or ext == ".bin":
+++                    es_file = f
+++                elif ext == ".md5":
+++                    if md5_file == "?":
+++                        md5_file = f
+++                    elif base[-3:] == "yuv":
+++                        md5_file = f
+++            aconf.append((1, name, es_file, md5_file))
+++    return aconf
+++
+++def runtest(name, tests):
+++    if not tests:
+++        return True
+++    for t in tests:
+++        if name[0:len(t)] == t:
+++            return True
+++        return False
+++
+++def doconf(csva, tests):
+++    failures = []
+++    unx_success = []
+++    for a in csva:
+++        exp_test = int(a[0])
+++        if (exp_test and runtest(a[1], tests)):
+++            name = a[1]
+++            print "==== ", name,
+++            sys.stdout.flush()
+++
+++            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+++                if exp_test == 1:
+++                    failures.append(name)
+++                    print ": * FAIL *"
+++                else:
+++                    print ": fail"
+++            else:
+++                if exp_test == 2:
+++                    print ": * OK *"
+++                    unx_success.append(name)
+++                else:
+++                    print ": ok"
+++
+++
+++    if failures or unx_success:
+++        print "Unexpected Failures:", failures
+++        print "Unexpected Success: ", unx_success
+++    else:
+++        print "All tests normal"
+++
+++
+++class ConfCSVDialect(csv.Dialect):
+++    delimiter = ','
+++    doublequote = True
+++    lineterminator = '\n'
+++    quotechar='"'
+++    quoting = csv.QUOTE_MINIMAL
+++    skipinitialspace = True
+++    strict = True
+++
+++if __name__ == '__main__':
+++
+++    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+++    argp.add_argument("tests", nargs='*')
+++    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+++    argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
+++    args = argp.parse_args()
+++
+++    if args.csvgen:
+++        csv.writer(sys.stdout).writerows(scandir(conf_root))
+++        exit(0)
+++
+++    with open(args.csv, 'rt') as csvfile:
+++        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+++
+++
+++    doconf(csva, args.tests)
+++
++diff --git a/pi-util/qasm.py b/pi-util/qasm.py
++new file mode 100644
++index 0000000..1eacc04
++--- /dev/null
+++++ b/pi-util/qasm.py
++@@ -0,0 +1,2502 @@
+++#!/usr/bin/env python
+++
+++#    add.ifz.setf  -, r0, ra0 ; fmul  rb1, rany2, 0 ; thrend # comment
+++#    add  r0, r0, 1                    # implicit mul nop
+++#    nop                               # explicit add nop, implicit mul nop
+++#    bkpt                              # implicit add/mul nop
+++#    mov  r0, 0x1234                   # hex immediate
+++#    mov  r0, 20 * 40                  # expressions...
+++#    mov  r0, f(sqrt(2.0) * 3.0)       # f() converts float to bits
+++#    mov  r0, a:label                  # put address of label in r0
+++# :label
+++#    bra.allnn  ra2, a:1f              # branch to label 1 (searching forward), using absolute address
+++# :1
+++#    brr.anyz  -, r:1b                 # branch to label 1 (searching backward), using relative address
+++# :1                                   # multiple definitions of numeric labels (differentiated using f/b)
+++# .set my_val, 3                       # introduce alias for 3
+++# .set my_reg, r0                      # and for r0
+++#    mov  my_reg, my_val               # then use them
+++# .set my_reg2, my_reg + my_val        # r0 plus 3 is r3
+++# .macro my_add, a, b, c               # a, b, c act as if .set on entry
+++# .set my_val, 10
+++#    add  a, b, c
+++#    mov  r0, my_val                   # 10
+++# .endm                                # forget all .sets since .macro (including arg .sets)
+++#    mov  r0, my_val                   # 3
+++#    my_add  my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
+++
+++import math
+++import optparse
+++import os
+++import random
+++import re
+++import struct
+++import sys
+++import time
+++
+++###############################################################################
+++# constants
+++###############################################################################
+++
+++# ops
+++######
+++
+++# negatives are internal qasm ops
+++
+++AOP_MOV     = -3   # two operands
+++AOP_BRA     = -2   # two operands
+++AOP_BRR     = -1   # two operands
+++AOP_NOP     = 0x00 # no operands
+++AOP_FADD    = 0x01 # three operands, like every op below without a note (see aops)
+++AOP_FSUB    = 0x02
+++AOP_FMIN    = 0x03
+++AOP_FMAX    = 0x04
+++AOP_FMINABS = 0x05
+++AOP_FMAXABS = 0x06
+++AOP_FTOI    = 0x07 # two operands
+++AOP_ITOF    = 0x08 # two operands
+++AOP_ADD     = 0x0c
+++AOP_SUB     = 0x0d
+++AOP_SHR     = 0x0e
+++AOP_ASR     = 0x0f
+++AOP_ROR     = 0x10
+++AOP_SHL     = 0x11
+++AOP_MIN     = 0x12
+++AOP_MAX     = 0x13
+++AOP_AND     = 0x14
+++AOP_OR      = 0x15
+++AOP_XOR     = 0x16
+++AOP_NOT     = 0x17 # two operands
+++AOP_CLZ     = 0x18 # two operands
+++AOP_V8ADDS  = 0x1e
+++AOP_V8SUBS  = 0x1f
+++
+++MOP_MOV    = -1  # two operands
+++MOP_NOP    = 0x0 # no operands
+++MOP_FMUL   = 0x1 # three operands, as are the remaining MOP_* ops (see mops)
+++MOP_MUL24  = 0x2
+++MOP_V8MULD = 0x3
+++MOP_V8MIN  = 0x4
+++MOP_V8MAX  = 0x5
+++MOP_V8ADDS = 0x6
+++MOP_V8SUBS = 0x7
+++
+++# ldi modes
+++############
+++
+++LDI_32          = 0
+++LDI_EL_SIGNED   = 1
+++LDI_EL_UNSIGNED = 3
+++LDI_SEMA        = 4
+++
+++# conds
+++########
+++
+++COND_NEVER  = 0
+++COND_ALWAYS = 1 # what get_cond() returns when no condition suffix is given
+++COND_IFZ    = 2
+++COND_IFNZ   = 3
+++COND_IFN    = 4
+++COND_IFNN   = 5
+++COND_IFC    = 6
+++COND_IFNC   = 7
+++
+++BCOND_ALLZ   = 0
+++BCOND_ALLNZ  = 1
+++BCOND_ANYZ   = 2
+++BCOND_ANYNZ  = 3
+++BCOND_ALLN   = 4
+++BCOND_ALLNN  = 5
+++BCOND_ANYN   = 6
+++BCOND_ANYNN  = 7
+++BCOND_ALLC   = 8
+++BCOND_ALLNC  = 9
+++BCOND_ANYC   = 10
+++BCOND_ANYNC  = 11
+++BCOND_ALWAYS = 15 # what get_bcond() returns when no condition suffix is given
+++
+++# packing/unpacking
+++####################
+++
+++# regfile a pack modes
+++PACK_A_NOP   = 0
+++PACK_A_16A   = 1
+++PACK_A_16B   = 2
+++PACK_A_8888  = 3
+++PACK_A_8A    = 4
+++PACK_A_8B    = 5
+++PACK_A_8C    = 6
+++PACK_A_8D    = 7
+++PACK_A_32S   = 8
+++PACK_A_16AS  = 9
+++PACK_A_16BS  = 10
+++PACK_A_8888S = 11
+++PACK_A_8AS   = 12
+++PACK_A_8BS   = 13
+++PACK_A_8CS   = 14
+++PACK_A_8DS   = 15
+++
+++# mul unit pack modes
+++PACK_MUL_NOP  = 0
+++PACK_MUL_8888 = 3
+++PACK_MUL_8A   = 4
+++PACK_MUL_8B   = 5
+++PACK_MUL_8C   = 6
+++PACK_MUL_8D   = 7
+++
+++# regfile a unpack modes
+++UNPACK_A_NOP = 0
+++UNPACK_A_16A = 1
+++UNPACK_A_16B = 2
+++UNPACK_A_8R  = 3
+++UNPACK_A_8A  = 4
+++UNPACK_A_8B  = 5
+++UNPACK_A_8C  = 6
+++UNPACK_A_8D  = 7
+++
+++# r4 unpack modes
+++UNPACK_R4_NOP = 0
+++UNPACK_R4_16A = 1
+++UNPACK_R4_16B = 2
+++UNPACK_R4_8R  = 3
+++UNPACK_R4_8A  = 4
+++UNPACK_R4_8B  = 5
+++UNPACK_R4_8C  = 6
+++UNPACK_R4_8D  = 7
+++
+++PACK_TYPE_INT    = 0
+++PACK_TYPE_FLOAT  = 1
+++PACK_TYPE_EITHER = -1 # also reported by get_pack() when no pack is given
+++
+++PACK_MODE_A      = 0 # regfile a
+++PACK_MODE_M      = 1 # mul unit
+++PACK_MODE_EITHER = -1 # also reported by get_pack() when no pack is given
+++
+++UNPACK_LOC_A     = 0 # regfile a
+++UNPACK_LOC_R4    = 1 # r4
+++UNPACK_LOC_AB    = 2 # either regfile a or regfile b
+++UNPACK_LOC_OTHER = 3 # somewhere else
+++
+++# args
+++#######
+++
+++# loc_t, ie internal
+++MUX_AC  = 0 # the r0..r5 names in arg_defs
+++MUX_ANY = 1 # the rany0..rany63 names in arg_defs
+++MUX_A   = 2 # the ra0..ra63 names in arg_defs
+++MUX_B   = 3 # the rb0..rb63 names in arg_defs
+++RW_EITHER = 0
+++RW_READ   = 1 # check_and_fixup_loc() rejects writes to such a location
+++RW_WRITE  = 2 # check_and_fixup_loc() rejects reads from such a location
+++
+++RADDR_NOP = 39 # same value as WADDR_NOP
+++
+++# negatives are for internal use
+++RMUX_SEMA  = -6
+++RMUX_LABEL = -5
+++RMUX_IMMV  = -4
+++RMUX_IMM   = -3
+++RMUX_AC    = -2
+++RMUX_ANY   = -1
+++RMUX_A0    = 0 # followed by A1, A2, A3, A4, A5
+++RMUX_A     = 6
+++RMUX_B     = 7
+++
+++WADDR_R0  = 32 # followed by R1, R2, R3
+++WADDR_NOP = 39 # the write address arg_eval() gives to a '-' destination
+++
+++WMUX_ANY = 0
+++WMUX_A   = 1
+++WMUX_B   = 2
+++
+++# signals
+++##########
+++
+++SIG_BKPT       = 0
+++SIG_NORMAL     = 1
+++SIG_THRSW      = 2
+++SIG_THREND     = 3
+++SIG_SBWAIT     = 4
+++SIG_SBDONE     = 5
+++SIG_INT        = 6 # on a0
+++SIG_LTHRSW     = 6 # on b0
+++SIG_LOADCV     = 7
+++SIG_LOADC      = 8
+++SIG_LDCEND     = 9
+++SIG_LDTMU0     = 10
+++SIG_LDTMU1     = 11
+++SIG_ROTATE     = 12 # on a0
+++SIG_LOADAM     = 12 # on b0
+++SIG_SMALLIMMED = 13
+++SIG_IMMED      = 14
+++SIG_BRANCH     = 15
+++
+++# multi-line assembler constructs
+++##################################
+++
+++CONSTRUCT_MACRO = 0x1 # each CONSTRUCT_* value is a distinct single bit
+++CONSTRUCT_IF    = 0x2
+++CONSTRUCT_ELSE  = 0x4
+++CONSTRUCT_REP   = 0x8
+++
+++###############################################################################
+++# helpers
+++###############################################################################
+++
+++def asm_error(message, location = None):
+++   if location is None:
+++      location = current_location
+++   if location == '':
+++      sys.stderr.write('qasm ERROR: %s\n' % message)
+++   else:
+++      sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
+++   sys.exit(-1)
+++
+++def asm_warning(message, location = None):
+++   if disable_warnings or (nwarn_level != 0):
+++      return
+++   if location is None:
+++      location = current_location
+++   if location == '':
+++      sys.stderr.write('qasm WARNING: %s\n' % message)
+++   else:
+++      sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
+++   if warnings_are_errors:
+++      asm_error('warnings are errors!', location)
+++
+++# smart_split('') = []
+++# smart_split('a') = ['a']
+++# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
+++def smart_split(s, delim = ',', count = 0):
+++   if len(s) == 0:
+++      return []
+++   parts = []
+++   depth = 0
+++   i = 0
+++   for j in xrange(len(s)):
+++      if s[j] in '([{':
+++         depth += 1
+++      elif s[j] in ')]}':
+++         depth -= 1
+++      elif (s[j] == delim) and (depth == 0):
+++         parts.append(s[i:j])
+++         i = j + 1
+++         if len(parts) == count:
+++            break
+++   if depth != 0:
+++      asm_error('bracket nesting fail')
+++   parts.append(s[i:])
+++   return parts
+++
+++def is_int(x):
+++   return isinstance(x, int) or isinstance(x, long)
+++
+++###############################################################################
+++# "parsing" stuff
+++###############################################################################
+++
+++re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$') # .macro name[, param]...
+++re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$') # .ifset/.ifnset name, or .if condition
+++re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$') # same forms as re_if, for .elif
+++re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
+++re_include = re.compile('\\.include\\s(?P<filename>.+)$')
+++re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
+++re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
+++re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
+++re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
+++re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
+++re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$') # .d1 / .d2 / .d4 followed by the data
+++re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
+++re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$') # :name, ::name or :digits
+++re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$') # op[.cond][.setf] [args]
+++re_label_ref_left = re.compile('\\b([ar]):') # the a: / r: prefix of a label reference (rewritten in arg_eval)
+++re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') # NOTE(review): the '$' binds to the \d+[bf] alternative only
+++re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
+++
+++# ops
+++######
+++
+++aops = { # mnemonic -> (AOP_* code, number of operands)
+++   'mov': (AOP_MOV, 2),
+++   'bra': (AOP_BRA, 2),
+++   'brr': (AOP_BRR, 2),
+++   'nop': (AOP_NOP, 0),
+++   'fadd': (AOP_FADD, 3),
+++   'fsub': (AOP_FSUB, 3),
+++   'fmin': (AOP_FMIN, 3),
+++   'fmax': (AOP_FMAX, 3),
+++   'fminabs': (AOP_FMINABS, 3),
+++   'fmaxabs': (AOP_FMAXABS, 3),
+++   'ftoi': (AOP_FTOI, 2),
+++   'itof': (AOP_ITOF, 2),
+++   'add': (AOP_ADD, 3),
+++   'sub': (AOP_SUB, 3),
+++   'shr': (AOP_SHR, 3),
+++   'asr': (AOP_ASR, 3),
+++   'ror': (AOP_ROR, 3),
+++   'shl': (AOP_SHL, 3),
+++   'min': (AOP_MIN, 3),
+++   'max': (AOP_MAX, 3),
+++   'and': (AOP_AND, 3),
+++   'or': (AOP_OR, 3),
+++   'xor': (AOP_XOR, 3),
+++   'not': (AOP_NOT, 2),
+++   'clz': (AOP_CLZ, 2),
+++   'v8adds': (AOP_V8ADDS, 3),
+++   'v8subs': (AOP_V8SUBS, 3)}
+++
+++def get_aop(aop):
+++   if aop not in aops:
+++      asm_error('invalid aop')
+++   return aops[aop]
+++
+++mops = { # mnemonic -> (MOP_* code, number of operands)
+++   'mov': (MOP_MOV, 2),
+++   'nop': (MOP_NOP, 0),
+++   'fmul': (MOP_FMUL, 3),
+++   'mul24': (MOP_MUL24, 3),
+++   'v8muld': (MOP_V8MULD, 3),
+++   'v8min': (MOP_V8MIN, 3),
+++   'v8max': (MOP_V8MAX, 3),
+++   'v8adds': (MOP_V8ADDS, 3),
+++   'v8subs': (MOP_V8SUBS, 3)}
+++
+++def get_mop(mop):
+++   if mop not in mops:
+++      asm_error('invalid mop')
+++   return mops[mop]
+++
+++# conds
+++########
+++
+++conds = { # condition suffix -> COND_* code (no suffix is handled in get_cond)
+++   'ifz': COND_IFZ,
+++   'ifnz': COND_IFNZ,
+++   'ifn': COND_IFN,
+++   'ifnn': COND_IFNN,
+++   'ifc': COND_IFC,
+++   'ifnc': COND_IFNC}
+++
+++def get_cond(cond):
+++   if not cond:
+++      return COND_ALWAYS
+++   if cond not in conds:
+++      asm_error('invalid cond')
+++   return conds[cond]
+++
+++bconds = { # branch condition suffix -> BCOND_* code (no suffix is handled in get_bcond)
+++   'allz': BCOND_ALLZ,
+++   'allnz': BCOND_ALLNZ,
+++   'anyz': BCOND_ANYZ,
+++   'anynz': BCOND_ANYNZ,
+++   'alln': BCOND_ALLN,
+++   'allnn': BCOND_ALLNN,
+++   'anyn': BCOND_ANYN,
+++   'anynn': BCOND_ANYNN,
+++   'allc': BCOND_ALLC,
+++   'allnc': BCOND_ALLNC,
+++   'anyc': BCOND_ANYC,
+++   'anync': BCOND_ANYNC}
+++
+++def get_bcond(bcond):
+++   if not bcond:
+++      return BCOND_ALWAYS
+++   if bcond not in bconds:
+++      asm_error('invalid bcond')
+++   return bconds[bcond]
+++
+++def get_setf(setf):
+++   if not setf:
+++      return False
+++   return True
+++
+++# packing/unpacking
+++####################
+++
+++packs = { # pack suffix -> (pack mode value, PACK_TYPE_*, PACK_MODE_*)
+++   '16a':    (PACK_A_16A,    PACK_TYPE_INT,    PACK_MODE_A),
+++   '16b':    (PACK_A_16B,    PACK_TYPE_INT,    PACK_MODE_A),
+++   '16af':   (PACK_A_16A,    PACK_TYPE_FLOAT,  PACK_MODE_A),
+++   '16bf':   (PACK_A_16B,    PACK_TYPE_FLOAT,  PACK_MODE_A),
+++   '8abcd':  (PACK_A_8888,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8a':     (PACK_A_8A,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8b':     (PACK_A_8B,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8c':     (PACK_A_8C,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8d':     (PACK_A_8D,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   's':      (PACK_A_32S,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '16as':   (PACK_A_16AS,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '16bs':   (PACK_A_16BS,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8abcds': (PACK_A_8888S,  PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8as':    (PACK_A_8AS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8bs':    (PACK_A_8BS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8cs':    (PACK_A_8CS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8ds':    (PACK_A_8DS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8ac':    (PACK_MUL_8A,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8bc':    (PACK_MUL_8B,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8cc':    (PACK_MUL_8C,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8dc':    (PACK_MUL_8D,   PACK_TYPE_EITHER, PACK_MODE_M)}
+++
+++def get_pack(pack):
+++   if not pack:
+++      return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
+++   if pack not in packs:
+++      asm_error('invalid pack')
+++   return packs[pack]
+++
+++a_unpacks = { # unpack suffix -> (UNPACK_A_* mode, PACK_TYPE_*)
+++   '16a':  (UNPACK_A_16A, PACK_TYPE_INT),
+++   '16b':  (UNPACK_A_16B, PACK_TYPE_INT),
+++   '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
+++   '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
+++   '8dr':  (UNPACK_A_8R,  PACK_TYPE_EITHER),
+++   '8a':   (UNPACK_A_8A,  PACK_TYPE_INT),
+++   '8b':   (UNPACK_A_8B,  PACK_TYPE_INT),
+++   '8c':   (UNPACK_A_8C,  PACK_TYPE_INT),
+++   '8d':   (UNPACK_A_8D,  PACK_TYPE_INT),
+++   '8ac':  (UNPACK_A_8A,  PACK_TYPE_FLOAT),
+++   '8bc':  (UNPACK_A_8B,  PACK_TYPE_FLOAT),
+++   '8cc':  (UNPACK_A_8C,  PACK_TYPE_FLOAT),
+++   '8dc':  (UNPACK_A_8D,  PACK_TYPE_FLOAT)}
+++
+++def get_a_unpack(unpack):
+++   if not unpack:
+++      return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
+++   if unpack not in a_unpacks:
+++      asm_error('invalid ra unpack')
+++   return a_unpacks[unpack] + (UNPACK_LOC_A,)
+++
+++r4_unpacks = { # unpack suffix -> UNPACK_R4_* mode
+++   '16af': UNPACK_R4_16A,
+++   '16bf': UNPACK_R4_16B,
+++   '8dr':  UNPACK_R4_8R,
+++   '8ac':  UNPACK_R4_8A,
+++   '8bc':  UNPACK_R4_8B,
+++   '8cc':  UNPACK_R4_8C,
+++   '8dc':  UNPACK_R4_8D}
+++
+++def get_r4_unpack(unpack):
+++   if not unpack:
+++      return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+++   if unpack not in r4_unpacks:
+++      asm_error('invalid r4 unpack')
+++   return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4)
+++
+++# args
+++#######
+++
+++class loc_t: # a register location: mux (MUX_*), index i, rotations, pack name and RW_* access
+++   def __init__(self, mux, i, rot, r5_rot, pack, rw):
+++      self.mux = mux
+++      self.i = i
+++      self.rot = rot % 16 # rotation by an immediate, kept in 0..15
+++      self.r5_rot = r5_rot % 16 # rotation by r5, kept in 0..15
+++      self.pack = pack
+++      self.rw = rw
+++
+++   def copy(self):
+++      return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __add__(self, i): # loc + n: same kind of location with the index advanced by n
+++      if not is_int(i):
+++         raise Exception('can only add integer to loc')
+++      return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __sub__(self, i): # loc - n: same kind of location with the index reduced by n
+++      if not is_int(i):
+++         raise Exception('can only subtract integer from loc')
+++      return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __cmp__(self, other): # Python 2 comparison hook; an int is compared against the index only
+++      if is_int(other):
+++         return cmp(self.i, other)
+++      if not isinstance(other, loc_t):
+++         raise Exception('can only compare loc to integer or other loc')
+++      if self.mux != other.mux:
+++         return cmp(self.mux, other.mux)
+++      if self.i != other.i:
+++         return cmp(self.i, other.i)
+++      if self.rot != other.rot:
+++         return cmp(self.rot, other.rot)
+++      if self.r5_rot != other.r5_rot:
+++         return cmp(self.r5_rot, other.r5_rot)
+++      return cmp(self.pack, other.pack) # note: rw takes no part in the comparison
+++
+++   def is_r5(self):
+++      return (self.mux == MUX_AC) and (self.i == 5)
+++
+++   def shift(self, rot, left): # new loc rotated by rot (an int or plain r5); left subtracts, right adds
+++      if isinstance(rot, loc_t) and rot.is_r5():
+++         if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
+++            raise Exception('can\'t rotate by rotated/unpacked r5')
+++         return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
+++      if not is_int(rot):
+++         raise Exception('can only rotate by integer or r5')
+++      return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
+++
+++   def __lshift__(self, rot):
+++      return self.shift(rot, True)
+++
+++   def __rshift__(self, rot):
+++      return self.shift(rot, False)
+++
+++   def __getattr__(self, name): # loc.<pack name> -> copy of this location with that pack selected
+++      # discard the first character if it is an underscore. this is a total hack
+++      # to allow packs starting with a digit to work
+++      if name[0] == '_':
+++         name = name[1:]
+++      if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
+++         if self.pack:
+++            raise Exception('can\'t specify two packs')
+++         return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
+++      raise AttributeError()
+++
+++   def __str__(self): # the regular register name (r/rany/ra/rb + index); rotation and pack are not shown
+++      if self.mux == MUX_AC:
+++         return 'r%d' % self.i
+++      if self.mux == MUX_ANY:
+++         return 'rany%d' % self.i
+++      if self.mux == MUX_A:
+++         return 'ra%d' % self.i
+++      if self.mux == MUX_B:
+++         return 'rb%d' % self.i
+++      assert 0 # mux is none of the MUX_* values
+++
+++class sema_t: # semaphore operand: acq is True for sacq(i), False for srel(i)
+++   def __init__(self, acq, i):
+++      if not is_int(i):
+++         raise Exception('semaphore index must be integer')
+++      self.acq = acq
+++      self.i = i
+++
+++class label_t: # reference to label name plus an offset; rel is True for r: references, False for a:
+++   def __init__(self, rel, name, offset):
+++      self.rel = rel
+++      self.name = name
+++      self.offset = offset
+++
+++   def __add__(self, offset): # label + n: same label, offset increased by n
+++      return label_t(self.rel, self.name, self.offset + offset)
+++
+++   def __sub__(self, offset): # label - n: same label, offset decreased by n
+++      return label_t(self.rel, self.name, self.offset - offset)
+++
+++class label_maker_t: # attribute access on an instance builds a label_t (see arg_eval's rewrite of a:/r:)
+++   def __init__(self, rel):
+++      self.rel = rel
+++
+++   def __getattr__(self, name):
+++      # we discard the first character. this is a total hack to allow numeric labels to work
+++      if not re_label_ref_right.match(name[1:]):
+++         raise Exception('invalid label reference')
+++      return label_t(self.rel, name[1:], 0) # offset starts at 0
+++
+++def bits(x, n): # returns x unchanged after checking that it fits in n bits
+++   if (x >> n) != 0: # something is set above the low n bits (always so for a negative int)
+++      raise Exception('%d doesn\'t fit in %d bits' % (x, n))
+++   return x
+++
+++def bitsw(x, n):
+++   if x == (1 << n):
+++      x = 0
+++   return bits(x, n)
+++
+++def bitsws(x, n):
+++   if x == (1 << (n - 1)):
+++      x = 0
+++   if -(1 << (n - 1)) <= x < 0:
+++      x += 1 << n
+++   return bits(x, n)
+++
+++def vpm_setup(n, stride, addr, v2 = False): # packs n, stride and addr into one setup word; v2 selects the second layout
+++   horiz, laned, size, y, x, p = addr # 6-tuple as built by the h32/h16l/.../v8p helpers in arg_defs
+++   if size not in (0, 1, 2):
+++      raise Exception('addr size should be 0, 1, or 2')
+++   if horiz:
+++      if x != 0:
+++         raise Exception('horizontal accesses must have x of 0')
+++   else:
+++      if (y & 0xf) != 0:
+++         raise Exception('vertical accesses must be 16 row aligned')
+++   hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) # 4-bit field: horiz, laned, then 2 - size
+++   if v2:
+++      return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
+++         (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
+++   return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
+++      (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size))
+++
+++def vdw_setup_0(n, m, addr): # packs n, m and addr into one setup word (top two bits = 2)
+++   horiz, size, y, x, p = addr # 5-tuple as built by the dma_* helpers in arg_defs
+++   if size not in (0, 1, 2):
+++      raise Exception('addr size should be 0, 1, or 2')
+++   return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
+++      (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size))
+++
+++def vdr_setup_0(n, m, addr, vpm_stride, stride): # packs its arguments into one setup word (top bit set)
+++   horiz, size, y, x, p = addr # 5-tuple as built by the dma_* helpers in arg_defs
+++   if size not in (0, 1, 2):
+++      raise Exception('addr size should be 0, 1, or 2')
+++   if (stride < 8) or (stride & (stride - 1)):
+++      raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
+++   log2_stride = 3
+++   while (1 << log2_stride) != stride: # stride is a power of two >= 8 here, so this terminates
+++      log2_stride += 1
+++   return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
+++      (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
+++      (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4))
+++
+++class allocator_t:
+++   def __init__(self, *available):
+++      self.available = list(available)
+++      self.allocated = {}
+++      self.reserved = []
+++
+++   def copy(self):
+++      a = allocator_t()
+++      a.available = self.available[:]
+++      a.allocated = self.allocated.copy()
+++      a.reserved = self.reserved[:]
+++      return a
+++
+++   def forget(self):
+++      self.__init__(self.available + self.allocated.values() + self.reserved)
+++
+++   def reserve(self, *rs):
+++      for r in rs:
+++         self.available.remove(r)
+++         self.reserved.append(r)
+++
+++   def retire(self, name):
+++      r = self.allocated.pop(name)
+++      del r.__invert__
+++      del r.retire
+++      self.available.append(r)
+++      return r
+++
+++   def __getattr__(self, name):
+++      if name not in self.allocated:
+++         r = self.available.pop()
+++         r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
+++         r.__invert__ = r.retire
+++         self.allocated[name] = r
+++      return self.allocated[name]
+++
+++def pragma_allow_xor_0(x):
+++   global allow_xor_0
+++
+++   if not isinstance(x, bool):
+++      raise Exception('allow_xor_0 must be bool')
+++   x, allow_xor_0 = allow_xor_0, x
+++   return x
+++
+++def pragma_dont_warn_when_mul_rot_inp_r5(x):
+++   global dont_warn_when_mul_rot_inp_r5
+++
+++   if not isinstance(x, bool):
+++      raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
+++   x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
+++   return x
+++
+++arg_defs = { # predefined names for argument expressions (arg_eval's sets starts as a copy of this)
+++   # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
+++   'w':             loc_t(MUX_A,   15, 0, 0, None, RW_EITHER),
+++   'z':             loc_t(MUX_B,   15, 0, 0, None, RW_EITHER),
+++   'unif':          loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
+++   'vary':          loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
+++   'tmurs':         loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
+++   'r5quad':        loc_t(MUX_A,   37, 0, 0, None, RW_WRITE),
+++   'r5rep':         loc_t(MUX_B,   37, 0, 0, None, RW_WRITE),
+++   'elem_num':      loc_t(MUX_A,   38, 0, 0, None, RW_READ),
+++   'qpu_num':       loc_t(MUX_B,   38, 0, 0, None, RW_READ),
+++   'unif_addr':     loc_t(MUX_A,   40, 0, 0, None, RW_WRITE),
+++   'unif_addr_rel': loc_t(MUX_B,   40, 0, 0, None, RW_WRITE),
+++   'x_coord':       loc_t(MUX_A,   41, 0, 0, None, RW_EITHER),
+++   'y_coord':       loc_t(MUX_B,   41, 0, 0, None, RW_EITHER),
+++   'ms_mask':       loc_t(MUX_A,   42, 0, 0, None, RW_EITHER),
+++   'rev_flag':      loc_t(MUX_B,   42, 0, 0, None, RW_EITHER),
+++   'stencil':       loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
+++   'tlbz':          loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
+++   'tlbm':          loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
+++   'tlbc':          loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
+++   'vpm':           loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
+++   'vr_busy':       loc_t(MUX_A,   49, 0, 0, None, RW_READ),
+++   'vw_busy':       loc_t(MUX_B,   49, 0, 0, None, RW_READ),
+++   'vr_setup':      loc_t(MUX_A,   49, 0, 0, None, RW_WRITE),
+++   'vw_setup':      loc_t(MUX_B,   49, 0, 0, None, RW_WRITE),
+++   'vr_wait':       loc_t(MUX_A,   50, 0, 0, None, RW_READ),
+++   'vw_wait':       loc_t(MUX_B,   50, 0, 0, None, RW_READ),
+++   'vr_addr':       loc_t(MUX_A,   50, 0, 0, None, RW_WRITE),
+++   'vw_addr':       loc_t(MUX_B,   50, 0, 0, None, RW_WRITE),
+++   'mutex':         loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
+++   'recip':         loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
+++   'recipsqrt':     loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+++   'rsqrt':         loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+++   'exp':           loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
+++   'log':           loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
+++   't0s':           loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
+++   't0t':           loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
+++   't0r':           loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
+++   't0b':           loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
+++   't1s':           loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
+++   't1t':           loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
+++   't1r':           loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
+++   't1b':           loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
+++
+++   # semaphore acq/rel
+++   'sacq': lambda i: sema_t(True, i),
+++   'srel': lambda i: sema_t(False, i),
+++
+++   # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
+++   'r_label_maker': label_maker_t(True),
+++   'a_label_maker': label_maker_t(False),
+++
+++   # handy functions
+++   'f':     lambda x: struct.unpack('I', struct.pack('f', x))[0], # bit pattern of x stored as a 32-bit float
+++   'sqrt':  math.sqrt,
+++   'sin':   math.sin,
+++   'cos':   math.cos,
+++   'atan2': math.atan2,
+++   'pi':    math.pi,
+++   'rseed': random.seed,
+++   'rand':  lambda: int(random.getrandbits(32)),
+++   'bits':  bits,
+++   'bitsw': bitsw,
+++   'bitsws': bitsws,
+++
+++   # handy vpm/vdw/vdr stuff
+++   'h32':  lambda y:       (1, 0, 0, y, 0, 0), # 6-tuples in the (horiz, laned, size, y, x, p) order vpm_setup unpacks
+++   'h16l': lambda y, p:    (1, 1, 1, y, 0, p),
+++   'h16p': lambda y, p:    (1, 0, 1, y, 0, p),
+++   'h8l':  lambda y, p:    (1, 1, 2, y, 0, p),
+++   'h8p':  lambda y, p:    (1, 0, 2, y, 0, p),
+++   'v32':  lambda y, x:    (0, 0, 0, y, x, 0),
+++   'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
+++   'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
+++   'v8l':  lambda y, x, p: (0, 1, 2, y, x, p),
+++   'v8p':  lambda y, x, p: (0, 0, 2, y, x, p),
+++   'dma_h32':  lambda y, x:    (1, 0, y, x, 0), # 5-tuples in the (horiz, size, y, x, p) order vdw_setup_0/vdr_setup_0 unpack
+++   'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
+++   'dma_h8p':  lambda y, x, p: (1, 2, y, x, p),
+++   'dma_v32':  lambda y, x:    (0, 0, y, x, 0),
+++   'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
+++   'dma_v8p':  lambda y, x, p: (0, 2, y, x, p),
+++   'vpm_setup': vpm_setup,
+++   'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
+++   'vdw_setup_0': vdw_setup_0,
+++   'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
+++   'vdr_setup_0': vdr_setup_0,
+++   'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
+++   'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
+++
+++   # annotations
+++   'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)),
+++   'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), # same 'mul_used' tag, mask inverted within 16 bits
+++   'preserve_cond': ('preserve_cond', 1),
+++
+++   # somewhat experimental register allocator
+++   'allocator_t': allocator_t,
+++
+++   # pragmas
+++   'pragma_allow_xor_0': pragma_allow_xor_0,
+++   'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
+++
+++# accumulators and regs (regular names -- r0, ra0, etc)
+++arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) # r0..r5
+++arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++
+++def arg_eval(arg, sets): # evaluates one argument expression with sets as its namespace; errors are fatal
+++   s = (arg.strip().split('.', 1) + [None])[:2] # [text before the first '.', text after it or None]
+++   if s[0] == '-': # '-' (optionally '-.pack') is the write-only nop destination
+++      return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
+++   arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
+++   arg = re_pack.sub('._\\1', arg) # '.8a' -> '._8a' so a digit-led pack name parses as an attribute
+++   try:
+++      # todo: i would like to be able to pass both arg_defs and sets in here
+++      # (with sets hiding arg_defs in the case of conflicts), but the obvious
+++      # dict(arg_defs, **sets) won't permit things such as:
+++      # .set f, lambda x: y
+++      # .set y, 4
+++      # (the y in the lambda will be looked up in the temporary dict we created
+++      # when evaluating the f .set, which doesn't contain y)
+++      #
+++      # instead, sets is initially set to (a copy of) arg_defs. to simulate the
+++      # hiding behaviour, on an unset, we restore any hidden arg_defs value.
+++      # also, before dumping sets at the end, we strip out the arg_defs stuff
+++      # (this isn't entirely correct as we want to dump sets that are hiding
+++      # arg_defs)
+++      return eval(arg, sets) # NOTE(review): eval() runs arbitrary Python taken from the assembly source
+++   except Exception, e:
+++      asm_error(e)
+++   except:
+++      asm_error('unknown error while evaluating argument')
+++
+++# doesn't check/fixup pack
+++def check_and_fixup_loc(loc, read): # validates loc for a read (read true) or a write; may modify loc in place
+++   if (not read) and (loc.rw == RW_READ):
+++      asm_error('writing to read-only hardware register')
+++   if read and (loc.rw == RW_WRITE):
+++      asm_error('reading from write-only hardware register')
+++   if not read:
+++      # conceptually, we are writing to a location rotated right by
+++      # loc.rot/loc.r5_rot. but we are actually rotating the output right by
+++      # -loc.rot/-loc.r5_rot then writing it to the unrotated location
+++      loc.rot = -loc.rot % 16
+++      loc.r5_rot = -loc.r5_rot % 16
+++   if (loc.rot != 0) and (loc.r5_rot != 0):
+++      asm_error('can\'t rotate by both r5 and immediate')
+++   if (loc.r5_rot != 0) and (loc.r5_rot != 1):
+++      asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) # read indexes the pair: False -> 'left', True -> 'right'
+++   if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
+++      if not read:
+++         asm_error('target doesn\'t support write rotation')
+++      if loc.mux == MUX_ANY:
+++         loc.mux = MUX_A # can't do rotated read from regfile b
+++      if loc.mux != MUX_A:
+++         asm_error('rotation on read only allowed from regfile a')
+++      if loc.i >= 32:
+++         asm_warning('rotation only works from physical regfile')
+++   if loc.mux == MUX_AC:
+++      if (loc.i < 0) or (loc.i >= 6): # only indices 0..5 are accepted for MUX_AC
+++         asm_error('reg out of range')
+++      if not read:
+++         if loc.i == 4:
+++            asm_error('not allowed to write to r4')
+++         if loc.i == 5:
+++
+++            asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
+++   elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B):
+++      if (loc.i < 0) or (loc.i >= 64): # only indices 0..63 are accepted here
+++         asm_error('reg out of range')
+++   else:
+++      assert 0 # mux is none of the MUX_* values
+++
+++def get_dst(dst, sets): # evaluate a destination operand; returns (waddr, wmux, pack, rot, r5_rot)
+++   if not dst:
+++      return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 # no operand given: no address/mux, unconstrained pack tuple, no rotation
+++   dst = arg_eval(dst, sets)
+++   if not isinstance(dst, loc_t):
+++      asm_error('invalid dst')
+++   dst = dst.copy() # check_and_fixup_loc modifies its argument, so work on a copy
+++   check_and_fixup_loc(dst, False)
+++   pack = get_pack(dst.pack) # pack[2] is compared against PACK_MODE_A below
+++   if dst.mux == MUX_AC:
+++      if pack[2] == PACK_MODE_A:
+++         asm_warning('ra packing only works when writing to physical regfile')
+++         return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot # warned above, but still emitted with the write mux pinned to WMUX_A
+++      return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+++   if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
+++      if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
+++         asm_warning('ra packing only works when writing to physical regfile')
+++      return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+++   if dst.mux == MUX_ANY:
+++      return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+++   if dst.mux == MUX_B:
+++      if pack[2] == PACK_MODE_A:
+++         asm_error('this packing operation can only be used for regfile a')
+++      return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
+++   assert 0 # dst.mux matched none of the cases above
+++
+++def get_src(src, sets): # evaluate a source operand; returns (raddr, rmux, unpack tuple, drot, dr5rot), where the shape of raddr depends on rmux
+++   if not src:
+++      return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None # no operand given
+++   src = arg_eval(src, sets)
+++   if isinstance(src, sema_t):
+++      if not have_sema:
+++         asm_error('target does not support semaphores')
+++      if (src.i < 0) or (src.i >= 16):
+++         asm_error('semaphore number must be in [0, 16)')
+++      return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # semaphore number in the low 4 bits, src.acq shifted up to bit 4
+++   if isinstance(src, label_t):
+++      return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # label is passed on unresolved as a (name, rel, offset) tuple
+++   if isinstance(src, list):
+++      if len(src) != 16:
+++         asm_error('vector immediate must have length 16')
+++      src = src[:] # copy so the masking below doesn't modify the evaluated list
+++      for i in xrange(16):
+++         if not is_int(src[i]):
+++            asm_error('all elements of vector immediate must be integers')
+++         src[i] &= (1 << 32) - 1 # keep the low 32 bits
+++      return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+++   if is_int(src):
+++      return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # scalar immediate, masked to 32 bits
+++   if not isinstance(src, loc_t):
+++      asm_error('invalid src')
+++   src = src.copy() # check_and_fixup_loc modifies its argument, so work on a copy
+++   check_and_fixup_loc(src, True)
+++   if mulw_rotate: # rotation is reported in the last two return slots rather than inside the regfile-a address tuple
+++      srot, sr5rot = 0, 0
+++      drot, dr5rot = src.rot, src.r5_rot
+++   else: # rotation travels with the regfile-a address tuple; the last two return slots stay 0
+++      srot, sr5rot = src.rot, src.r5_rot
+++      drot, dr5rot = 0, 0
+++   if src.mux == MUX_AC:
+++      if src.i == 4:
+++         return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot # index 4 is the only MUX_AC location that may carry an unpack
+++      if src.pack:
+++         asm_error('unpack only allowed for regfile a or r4')
+++      return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+++   if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
+++      return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot # RMUX_A addresses are (index, rot, r5_rot) tuples
+++   if src.mux == MUX_ANY:
+++      return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
+++   if src.mux == MUX_B:
+++      if src.pack:
+++         asm_error('unpack only allowed for regfile a or r4')
+++      return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+++   assert 0 # src.mux matched none of the cases above
+++
+++# signals
+++##########
+++
+++sigs = { # signal mnemonic -> SIG_* code; consulted by get_sig
+++   'bkpt': SIG_BKPT,
+++   'thrsw': SIG_THRSW,
+++   'thrend': SIG_THREND,
+++   'sbwait': SIG_SBWAIT,
+++   'sbdone': SIG_SBDONE,
+++   'int': SIG_INT,
+++   'loadcv': SIG_LOADCV,
+++   'loadc': SIG_LOADC,
+++   'ldcend': SIG_LDCEND,
+++   'ldtmu0': SIG_LDTMU0,
+++   'ldtmu1': SIG_LDTMU1}
+++
+++def get_sig(sig): # map a signal mnemonic to its SIG_* code
+++   if sig not in sigs:
+++      return SIG_NORMAL # anything that isn't a known mnemonic yields SIG_NORMAL rather than an error
+++   return sigs[sig]
+++
+++# annotations
+++##############
+++
+++def get_annots(annot, sets): # evaluate an annotation expression; returns a list of (string, 32-bit masked integer) pairs
+++   annots = arg_eval(annot, sets)
+++   if isinstance(annots, list):
+++      annots = annots[:] # copy, since entries are replaced below
+++   else:
+++      annots = [annots] # a single value is treated as a one-element list
+++   for i, annot in enumerate(annots): # note: reuses (shadows) the parameter name annot
+++      if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
+++         (not is_int(annot[1]))):
+++         asm_error('annotation must be (string, integer) pair, or a list of such pairs')
+++      annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) # keep the low 32 bits of the value
+++   return annots
+++
+++###############################################################################
+++# core
+++###############################################################################
+++
+++def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): # reconcile the unpack tuples of the 4 read operands with the pack tuples of the 2 results; returns (pm, pack, unpack, forcebs, forcerafloat)
+++   needfloat = PACK_TYPE_EITHER # PACK_TYPE_* requirement collected from the unpack operations
+++   havefloata = False # some regfile-a unpack operand has its rfloats flag set
+++   havefloatr4 = False # some r4 unpack operand has its rfloats flag set
+++   unpacka = None # unpack operation seen for regfile a (None until one is seen)
+++   unpackr4 = None # unpack operation seen for r4 (None until one is seen)
+++   forcebs = [False, False, False, False] # one flag per read operand, set in the second loop below
+++   forcerafloat = False
+++
+++   pm = PACK_MODE_EITHER
+++   for i in (0, 1, 2, 3): # each rpacks[i] is indexed as [0] unpack op, [1] PACK_TYPE_* requirement, [2] UNPACK_LOC_*
+++      if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
+++         assert rpacks[i][0] == 0 # these locations are expected to carry no unpack operation
+++      else:
+++         if rpacks[i][2] == UNPACK_LOC_A:
+++            if unpacka is None:
+++               unpacka = rpacks[i][0]
+++            elif unpacka != rpacks[i][0]:
+++               asm_error('conflicting unpack operations on regfile a')
+++            havefloata = havefloata or rfloats[i]
+++         elif rpacks[i][2] == UNPACK_LOC_R4:
+++            if unpackr4 is None:
+++               unpackr4 = rpacks[i][0]
+++            elif unpackr4 != rpacks[i][0]:
+++               asm_error('conflicting unpack operations on r4')
+++            havefloatr4 = havefloatr4 or rfloats[i]
+++         else:
+++            assert 0
+++
+++         if rpacks[i][1] != PACK_TYPE_EITHER:
+++            if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
+++               asm_error('conflicting unpack float requirements')
+++            needfloat = rpacks[i][1]
+++   for i in (0, 1, 2, 3): # second pass: unpacka is only final once every operand has been seen
+++      if rpacks[i][2] == UNPACK_LOC_AB:
+++         if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
+++            forcebs[i] = True # non-nop unpack from regfile a. must use b
+++
+++   if unpacka: # truthiness test: None and 0 both take the else branch
+++      if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: # float unpack needed but no float operand reads regfile a; couldrfloat (caller-supplied) permits forcing one
+++         havefloata = True
+++         forcerafloat = True
+++      havefloat = havefloata
+++   else:
+++      havefloat = havefloatr4
+++
+++   if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
+++      asm_error('float unpack operation used in integer alu operations')
+++   if (needfloat == PACK_TYPE_INT) and havefloat:
+++      asm_error('integer unpack operation used in float alu operation')
+++
+++   unpack = 0
+++   if unpacka and unpackr4:
+++      asm_error('cannot specify pack operation for both regfile a and r4')
+++   if unpacka: # a regfile-a unpack selects PACK_MODE_A
+++      pm = PACK_MODE_A
+++      unpack = unpacka
+++   elif unpackr4: # an r4 unpack selects PACK_MODE_M
+++      pm = PACK_MODE_M
+++      unpack = unpackr4
+++
+++   pack = 0
+++   if wpacks[0][2] == PACK_MODE_M: # wpacks[0] belongs to the add result (per the error text), wpacks[1] to the other result
+++      asm_error('mul-unit pack operation used on add result')
+++   for i in (0, 1): # the pack mode must agree with whatever the unpack side already chose
+++      if wpacks[i][2] == PACK_MODE_A:
+++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
+++            asm_error('conflicting pack modes')
+++         pm = PACK_MODE_A
+++         pack = wpacks[i][0]
+++      elif wpacks[i][2] == PACK_MODE_M:
+++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
+++            asm_error('conflicting pack modes')
+++         pm = PACK_MODE_M
+++         pack = wpacks[i][0]
+++
+++      if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
+++         asm_error('float pack operation used with integer alu result')
+++      if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
+++         asm_error('integer pack operation used with float alu result')
+++
+++   if pm == PACK_MODE_EITHER: # nothing constrained the mode; fall back to PACK_MODE_A
+++      pm = PACK_MODE_A
+++   return pm, pack, unpack, forcebs, forcerafloat
+++
+++# immediates that can be encoded with SIG_SMALLIMMED
+++bimms = {} # 32-bit immediate value -> encoding index; looked up by merge_rmux
+++bimms.update((i, i) for i in xrange(16)) # 0..15 encode as themselves
+++bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) # indices 16..31: -16..-1 as 32-bit two's complement
+++bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) # indices 32..39: single-precision float bit patterns with biased exponent 127..134, i.e. 1.0, 2.0, ..., 128.0
+++bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) # indices 40..47: biased exponent 119..126, i.e. 1/256, 1/128, ..., 1/2
+++
+++def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): # fold one read operand (raddr, rmux) into the shared read state; returns (raddr_a, raddr_b, immb, arot_r5, mux chosen for this operand)
+++   if rmux == RMUX_SEMA:
+++      asm_error('semaphore op can only be used with mov')
+++   if rmux == RMUX_LABEL:
+++      asm_error('label not allowed here')
+++   if rmux == RMUX_IMMV:
+++      asm_error('vector immediate can only be used with mov')
+++   if rmux == RMUX_IMM:
+++      if raddr not in bimms:
+++         asm_error('can\'t encode immediate 0x%08x' % raddr)
+++      raddr = bimms[raddr] # replace the value by its small-immediate encoding index
+++      if not immb:
+++         if raddr_b is not None:
+++            asm_error('regfile b and immediates don\'t mix')
+++         raddr_b = raddr # the immediate takes over the raddr_b slot
+++         immb = True
+++      elif raddr_b != raddr:
+++         asm_error('can only encode one rotation/immediate')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++   if rmux == RMUX_AC:
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr # no read slot consumed; the mux value is RMUX_A0 plus the index
+++   if rmux == RMUX_ANY: # operand may come from either regfile: reuse a matching slot, else claim a free one
+++      if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr):
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      if (not immb) and (raddr_b == raddr):
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++      if raddr_a is None:
+++         assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
+++         raddr_a = raddr
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      if raddr_b is None:
+++         assert not immb
+++         raddr_b = raddr
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++      asm_error('no free read slots')
+++   if rmux == RMUX_A: # here raddr is indexed as [0] regfile index, [1] immediate rotation, [2] r5 rotation
+++      if (not mulw_rotate) and (raddr_a is not None) and (
+++         ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): # 2-bit summary (bit 0: immediate rotation present, bit 1: r5 rotation present) of this operand vs. the recorded state
+++         asm_error('conflicting rotations from regfile a')
+++      if raddr_a is None:
+++         raddr_a = raddr[0]
+++      elif raddr_a != raddr[0]:
+++         asm_error('can only read from one location in each regfile')
+++      arot_r5 = raddr[2]
+++      if raddr[1] == 0:
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      raddr = 48 + raddr[1] # an immediate rotation is carried in the raddr_b slot as 48 + rotation
+++      if not immb:
+++         if raddr_b is not None:
+++            asm_error('regfile b and rotation don\'t mix')
+++         raddr_b = raddr
+++         immb = True
+++      elif raddr_b != raddr:
+++         asm_error('can only encode one rotation/immediate')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++   if rmux == RMUX_B:
+++      if immb:
+++         asm_error('regfile b and rotation/immediates don\'t mix')
+++      if raddr_b is None:
+++         raddr_b = raddr
+++      elif raddr_b != raddr:
+++         asm_error('can only read from one location in each regfile')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++   assert 0 # rmux matched none of the RMUX_* cases above
+++
+++# ok if:
+++# - accumulator (r0-r3)
+++# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
+++#   and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
+++#   was written by r5quad. so, by default, r5 isn't considered uniform. todo:
+++#   what about vr_wait/vw_wait/mutex?
+++def read_rot_ok(rmux, raddr_a, raddr_b): # rmux is compared as a plain number here: < 4, 5, 6 (checked against raddr_a) and 7 (checked against raddr_b)
+++   return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or # value 5 passes only when the dont_warn_when_mul_rot_inp_r5 global is set
+++      ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy
+++      ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy
+++
+++def asm_flush_prog_data(): # move the pending byte values in prog_data into prog as 'data' entries, then empty prog_data
+++   global prog_data # rebound to a fresh list at the end
+++
+++   while len(prog_data) & 7: # pad with zeros up to a multiple of 8 entries
+++      prog_data.append(0)
+++   for i in xrange(0, len(prog_data), 8): # 8 entries per prog tuple: two 32-bit words, lowest-index byte in the least significant position
+++      prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0),
+++         (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {}))
+++   prog_data = []
+++
+++def asm_line(sets, location, line):
+++   global current_location, construct, nwarn_level
+++
+++   prev_location = current_location
+++   current_location = location
+++
+++   try:
+++      if construct != None:
+++         if re_macro.match(line):
+++            construct_stack.append(CONSTRUCT_MACRO)
+++         elif re_if.match(line):
+++            construct_stack.append(CONSTRUCT_IF)
+++         elif re_rep.match(line):
+++            construct_stack.append(CONSTRUCT_REP)
+++         else:
+++            else_m = line == '.else'
+++            elif_m = re_elif.match(line)
+++            if elif_m:
+++               end_construct = CONSTRUCT_IF
+++            else:
+++               end_construct = {
+++                  '.endm':  CONSTRUCT_MACRO,
+++                  '.else':  CONSTRUCT_IF,
+++                  '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
+++                  '.endr':  CONSTRUCT_REP}.get(line)
+++            if end_construct is not None:
+++               end_construct &= construct_stack.pop()
+++               if end_construct == 0:
+++                  if elif_m:
+++                     asm_error('unexpected .elif')
+++                  asm_error('unexpected %s' % line)
+++               if len(construct_stack) == 0:
+++                  lines = construct
+++                  construct = None
+++                  if end_construct == CONSTRUCT_MACRO:
+++                     return
+++                  if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
+++                     condition_if, condition_else = lines[0]
+++                     lines = lines[1:]
+++                     if condition_if:
+++                        for location, line in lines:
+++                           asm_line(sets, location, line)
+++                     if else_m:
+++                        construct = [(condition_else, False)]
+++                        construct_stack.append(CONSTRUCT_ELSE)
+++                     elif elif_m:
+++                        if elif_m.group('set'):
+++                           condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
+++                        else:
+++                           condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
+++                        condition_else = condition_else and (not condition_if)
+++                        construct = [(condition_if, condition_else)]
+++                        construct_stack.append(CONSTRUCT_IF)
+++                     return
+++                  if end_construct == CONSTRUCT_REP:
+++                     name, count = lines[0]
+++                     lines = lines[1:]
+++                     for i in xrange(count):
+++                        sets[name] = i
+++                        for location, line in lines:
+++                           asm_line(sets, location, line)
+++                     return
+++                  assert 0
+++               if else_m:
+++                  construct_stack.append(CONSTRUCT_ELSE)
+++               elif elif_m:
+++                  construct_stack.append(CONSTRUCT_IF)
+++         construct.append((current_location, line))
+++         return
+++
+++      if line in ('.endm', '.else', '.endif', '.endr'):
+++         asm_error('unexpected %s' % line)
+++      if re_elif.match(line):
+++         asm_error('unexpected .elif')
+++
+++      m = re_macro.match(line)
+++      if m:
+++         construct = []
+++         construct_stack.append(CONSTRUCT_MACRO)
+++         macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
+++         return
+++
+++      m = re_if.match(line)
+++      if m:
+++         if m.group('set'):
+++            condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
+++         else:
+++            # not not forces condition to a bool (this matters if condition is
+++            # something mutable like a list)
+++            condition = not not arg_eval(m.group('condition'), sets)
+++         construct = [(condition, not condition)]
+++         construct_stack.append(CONSTRUCT_IF)
+++         return
+++
+++      m = re_rep.match(line)
+++      if m:
+++         count = arg_eval(m.group('count'), sets)
+++         if not is_int(count):
+++            asm_error('.rep count must be integer')
+++         construct = [(m.group('name'), count)]
+++         construct_stack.append(CONSTRUCT_REP)
+++         return
+++
+++      m = re_include.match(line)
+++      if m:
+++         filename = arg_eval(m.group('filename'), sets)
+++         if not isinstance(filename, str):
+++            asm_error('expected string')
+++         asm_file(sets, '%s: %s' % (current_location, filename), filename)
+++         return
+++
+++      m = re_set.match(line)
+++      if m:
+++         sets[m.group('name')] = arg_eval(m.group('val'), sets)
+++         return
+++
+++      m = re_unset.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name not in sets:
+++            asm_error('%s not set' % name)
+++         if name in arg_defs: # todo: see arg_eval
+++            sets[name] = arg_defs[name]
+++         else:
+++            del sets[name]
+++         return
+++
+++      m = re_eval.match(line)
+++      if m:
+++         arg_eval(m.group('expr'), sets)
+++         return
+++
+++      m = re_print_info_warn_error.match(line)
+++      if m:
+++         def print_fn(message):
+++            print message
+++         def info_fn(message):
+++            sys.stderr.write('%s\n' % message)
+++         {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
+++            m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
+++         return
+++
+++      m = re_assert.match(line)
+++      if m:
+++         if not arg_eval(m.group('condition'), sets):
+++            asm_error('assertion failure: \'%s\'' % m.group('condition'))
+++         return
+++
+++      m = re_data.match(line)
+++      if m:
+++         size = int(m.group('size'))
+++         for datum in smart_split(m.group('data')):
+++            datum = arg_eval(datum, sets)
+++            if not is_int(datum):
+++               asm_error('datum must be integer')
+++            prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
+++         return
+++
+++      m = re_macro_inst.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name in macros:
+++            params, lines = macros[name]
+++            args = smart_split(m.group('args'))
+++            if len(args) > len(params):
+++               asm_error('too many arguments to macro')
+++            sets = sets.copy()
+++            sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
+++            for param in params[len(args):]:
+++               if param in sets:
+++                  if param in arg_defs: # todo: see arg_eval
+++                     sets[param] = arg_defs[param]
+++                  else:
+++                     del sets[param]
+++            for location, line in lines:
+++               asm_line(sets, '%s: %s' % (current_location, location), line)
+++            return
+++
+++      if line == '.pushnwarn':
+++         nwarn_level += 1
+++         return
+++      if line == '.popnwarn':
+++         if nwarn_level == 0:
+++            asm_error('.popnwarn without .pushnwarn')
+++         nwarn_level -= 1
+++         return
+++
+++      # everything below assumes prog is up to date
+++      asm_flush_prog_data()
+++
+++      m = re_label.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name[0].isdigit():
+++            labels.setdefault(name, []).append(len(prog))
+++         else:
+++            if name[0] == ':':
+++               undecorated_name = name[1:]
+++            else:
+++               undecorated_name = name
+++            if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
+++               asm_error('named label defined twice')
+++            labels[name] = len(prog)
+++         return
+++
+++      annots = line.split('@')
+++      ops = [op.strip() for op in annots[0].split(';')]
+++      annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
+++      sig = get_sig(ops[-1])
+++      if sig != SIG_NORMAL:
+++         ops = ops[:-1]
+++      if len(ops) > 2:
+++         asm_error('too many ops')
+++      elif (len(ops) == 1) and (ops[0] == ''):
+++         ops = []
+++      ops = (ops + ['nop', 'nop'])[:2]
+++      m = re_op.match(ops[0])
+++      if not m:
+++         asm_error('invalid syntax')
+++      aop, aargs_n = get_aop(m.group('op'))
+++      if (aop == AOP_BRA) or (aop == AOP_BRR):
+++         acond = get_bcond(m.group('cond'))
+++      else:
+++         acond = get_cond(m.group('cond'))
+++      asf = get_setf(m.group('sf'))
+++      aargs = smart_split(m.group('args'))
+++      if len(aargs) != aargs_n:
+++         asm_error('wrong operand count')
+++      ard, ara, arb = (aargs + [None, None, None])[:3]
+++      m = re_op.match(ops[1])
+++      if not m:
+++         asm_error('invalid syntax')
+++      mop, margs_n = get_mop(m.group('op'))
+++      mcond = get_cond(m.group('cond'))
+++      msf = get_setf(m.group('sf'))
+++      margs = smart_split(m.group('args'))
+++      if len(margs) != margs_n:
+++         asm_error('wrong operand count')
+++      mrd, mra, mrb = (margs + [None, None, None])[:3]
+++      # eval srcs first so allocator can retire and reuse registers for dst
+++      aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
+++      abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
+++      maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
+++      mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
+++      awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
+++      mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
+++      if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
+++         ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
+++         asm_error('cannot have 2 arguments with different rotations')
+++      if aarmux is not None:
+++         awrot = (awrot + aadrot) % 16
+++         awrot_r5 = (awrot_r5 + aadrot_r5) % 16
+++      if (awrot != 0) or awrot_r5:
+++         asm_error('rotate not allowed on add write')
+++      if marmux is not None:
+++         mwrot = (mwrot + madrot) % 16
+++         mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
+++
+++      afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
+++      afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
+++      pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
+++         [aarpack, abrpack, marpack, mbrpack],
+++         [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
+++         aop == AOP_FTOI,
+++         [awpack, mwpack],
+++         [afloatw, mop == MOP_FMUL])
+++      if forcebs[0]:
+++         aarmux = RMUX_B
+++      if forcebs[1]:
+++         abrmux = RMUX_B
+++      if forcebs[2]:
+++         marmux = RMUX_B
+++      if forcebs[3]:
+++         mbrmux = RMUX_B
+++
+++      # extend nops to 3 operands
+++      if aop == AOP_NOP:
+++         awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+++      if mop == MOP_NOP:
+++         mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+++
+++      # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
+++      if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
+++         if forcerafloat:
+++            assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
+++            # instead of duplicating the 2nd operand, take the ra operand from
+++            # the mul op thus forcing the ra value to be considered a float for
+++            # the purposes of unpacking
+++            if marmux == RMUX_A:
+++               abraddr, abrmux = maraddr, marmux
+++            else:
+++               assert mbrmux == RMUX_A
+++               abraddr, abrmux = mbraddr, mbrmux
+++         else:
+++            abraddr, abrmux = aaraddr, aarmux
+++      else:
+++         assert not forcerafloat # can only forcerafloat if we have an unused operand
+++
+++      # handle write addrs
+++      if (awmux == mwmux) and (awmux != WMUX_ANY):
+++         asm_error('add/mul ops not allowed to write to same regfile')
+++      ws = (awmux == WMUX_B) or (mwmux == WMUX_A)
+++
+++      # handle branch
+++      if (aop == AOP_BRA) or (aop == AOP_BRR):
+++         # check setf
+++         if asf:
+++            asm_error('setf not allowed on bra/brr')
+++
+++         # check pack/unpack
+++         if (pack != 0) or (unpack != 0):
+++            asm_error('pack/unpack not allowed with bra/brr')
+++
+++         # handle read address
+++         if aarmux == RMUX_LABEL:
+++            if (aop == AOP_BRA) and aaraddr[1]:
+++               asm_warning('bra with rel label')
+++            if (aop == AOP_BRR) and (not aaraddr[1]):
+++               asm_warning('brr with abs label')
+++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+++         if aarmux == RMUX_ANY:
+++            aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
+++         if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
+++            asm_error('branch destination must be either label, immediate, or from regfile a')
+++         if aarmux == RMUX_IMM:
+++            imm = aaraddr
+++            raddr = 0 # can't use RADDR_NOP
+++         elif aarmux == RMUX_A:
+++            if (aaraddr[1] != 0) or (aaraddr[2] != 0):
+++               asm_error('rotation of read from regfile a not allowed with branch')
+++            if aop == AOP_BRR:
+++               asm_warning('brr with ra')
+++            imm = 0
+++            raddr = aaraddr[0]
+++         else:
+++            assert 0
+++
+++         # check mul op is nop
+++         if mop != MOP_NOP:
+++            asm_error('mul op not allowed with branch')
+++
+++         # check sig
+++         if sig != SIG_NORMAL:
+++            asm_error('no signal allowed with branch')
+++
+++         if raddr >= 32:
+++            asm_error('can only branch to register locations in physical regfile')
+++         if raddr & 1:
+++            asm_warning('branch instruction will destroy flags (see hw-2780)')
+++
+++         # construct branch instruction
+++         prog.append((imm,
+++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
+++            line, annots))
+++
+++         return
+++
+++      # use COND_NEVER when possible (might save power / allow mul setf)
+++      if not dict(annots).get('preserve_cond', 0):
+++          if (awaddr == WADDR_NOP) and (not asf):
+++             acond = COND_NEVER
+++          if (mwaddr == WADDR_NOP) and (not msf):
+++             mcond = COND_NEVER
+++
+++      # attempt to convert movs to ldi
+++      if (# no mul setf
+++         (not msf) and
+++         # ops must either be nop or mov of sema/label/imm/immv
+++         ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+++         ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+++         # but we don't want 2 nops
+++         ((aop != AOP_NOP) or (mop != MOP_NOP)) and
+++         # if both ops are movs, srcs must be identical
+++         ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
+++         # no signal
+++         (sig == SIG_NORMAL)):
+++         # make sure aarmux/aaraddr contains the value
+++         if aop != AOP_MOV:
+++            aarmux = marmux
+++            aaraddr = maraddr
+++
+++         # convert immediate
+++         if aarmux == RMUX_SEMA:
+++            ldi_mode = LDI_SEMA
+++         elif aarmux == RMUX_LABEL:
+++            ldi_mode = LDI_32
+++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+++         elif aarmux == RMUX_IMMV:
+++            signed, unsigned = True, True
+++            imm = 0
+++            for i, elem in enumerate(aaraddr):
+++               if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
+++                  signed = False
+++               if elem not in (0, 1, 2, 3):
+++                  unsigned = False
+++               imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
+++            if not (signed or unsigned):
+++               asm_error('can\'t encode vector immediate')
+++            if signed:
+++               ldi_mode = LDI_EL_SIGNED
+++            else:
+++               ldi_mode = LDI_EL_UNSIGNED
+++            aaraddr, aarmux = imm, RMUX_IMM
+++         elif aarmux == RMUX_IMM:
+++            ldi_mode = LDI_32
+++         else:
+++            assert 0
+++
+++         # construct ldi instruction
+++         prog.append((aaraddr,
+++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
+++            line, annots))
+++
+++         return
+++
+++      # convert movs to alu ops
+++      if aop == AOP_MOV:
+++         if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
+++            aop = AOP_XOR
+++            aaraddr, aarmux = 0, RMUX_AC
+++            abraddr, abrmux = 0, RMUX_AC
+++         else:
+++            aop = AOP_OR
+++            abraddr, abrmux = aaraddr, aarmux
+++      if mop == MOP_MOV:
+++         if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
+++            mop = MOP_V8SUBS
+++            maraddr, marmux = 0, RMUX_AC
+++            mbraddr, mbrmux = 0, RMUX_AC
+++         else:
+++            mop = MOP_V8MIN
+++            mbraddr, mbrmux = maraddr, marmux
+++
+++      # normal alu instruction...
+++
+++      # handle setf
+++      if asf and (aop == AOP_NOP):
+++         asm_error('nop.setf is not allowed in add pipe')
+++      if msf and (mop == MOP_NOP):
+++         asm_warning('nop.setf, really?')
+++      if (aop == AOP_NOP) or (acond == COND_NEVER):
+++         sf = msf
+++      else:
+++         if msf:
+++            asm_error('setf only allowed on mul op if add op is nop or add condition is never')
+++         sf = asf
+++
+++      # handle read addrs
+++      raddr_a = None
+++      raddr_b = None
+++      immb = False
+++      arot_r5 = False
+++      muxes = [0, 0, 0, 0]
+++      if mwrot != 0:
+++         raddr_b = 48 + mwrot
+++         immb = True
+++      if mwrot_r5 and have_am:
+++         raddr_b = 48
+++         immb = True
+++      for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last
+++         for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
+++            if f(rmux):
+++               raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
+++      add_a, add_b, mul_a, mul_b = muxes
+++      if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
+++         # some output elements might not be as expected
+++         if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
+++            bad_elems = 0xffff
+++         else:
+++            bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
+++            if mwrot > 12:
+++               bad_elems ^= 0xffff
+++         bad_elems &= dict(annots).get('mul_used', 0xffff)
+++         if not msf:
+++            if mwaddr == WADDR_NOP:
+++               # not writing anywhere and not setting flags. no elements used
+++               bad_elems = 0
+++            elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
+++               ((not ws) and (mwaddr == 37))):
+++               # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
+++               # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
+++               # only use element 0
+++               bad_elems &= 0x0001
+++            elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
+++               ((not ws) and (mwaddr == 42))):
+++               # writing to r5quad/x_coord/y_coord/rev_flag and not setting
+++               # flags. only use elements 0, 4, 8, and 12
+++               bad_elems &= 0x1111
+++         if bad_elems:
+++            asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
+++      if raddr_a is None:
+++         raddr_a = RADDR_NOP
+++      if raddr_b is None:
+++         raddr_b = RADDR_NOP
+++      if immb:
+++         if sig != SIG_NORMAL:
+++            asm_error('rotation/immediates and signal don\'t mix')
+++         sig = SIG_SMALLIMMED
+++      if arot_r5 or (mwrot_r5 and (not have_am)):
+++         if sig != SIG_NORMAL:
+++            asm_error('rotation/immediates/signal don\'t mix')
+++         sig = SIG_ROTATE
+++
+++      # construct instruction
+++      prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
+++         (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
+++         line, annots))
+++   finally:
+++      current_location = prev_location
+++
+++def preprocess_passthrough(file): # default preprocessor: yields every line of `file` unmodified
+++   line_number = 0
+++   for line in file:
+++      line_number += 1 # so the first line is reported as line 1
+++      yield line_number, line
+++
+++def asm_file(sets, location, filename, preprocess = None): # assemble one input; filename None means read stdin
+++   global current_dir, current_location
+++
+++   if filename is None:
+++      location = '<stdin>' # replaces whatever location the caller passed
+++      file = sys.stdin
+++
+++      prev_dir = current_dir
+++   else:
+++      filename = os.path.normpath(os.path.join(current_dir, filename)) # relative names resolve against current_dir
+++
+++      try:
+++         file = open(filename) # NOTE(review): never closed explicitly in this function
+++      except Exception, e:
+++         asm_error(e) # presumably asm_error does not return; otherwise `file` is unbound below - confirm
+++      except:
+++         asm_error('unknown error while opening file %s' % filename)
+++
+++      prev_dir = current_dir
+++      current_dir = os.path.dirname(filename) # later asm_file calls resolve relative names next to this file
+++
+++   prev_location = current_location
+++   current_location = location
+++
+++   if preprocess is None:
+++      preprocess = preprocess_passthrough
+++
+++   try:
+++      for line_number, line in preprocess(file): # preprocess yields (line_number, line) pairs
+++         # strip off comments and whitespace
+++         line = line.split('#')[0].strip()
+++         if line == '':
+++            continue
+++
+++         asm_line(sets, '%s: %d' % (current_location, line_number), line)
+++   finally:
+++      current_dir = prev_dir # restore the caller's context on both normal and exceptional exit
+++      current_location = prev_location
+++
+++def asm_end_prog(): # finish the current program: sanity checks, flush data, resolve label references in prog
+++   # check we aren't in a multi-line construct (eg .macro or .rep)
+++   if construct != None:
+++      asm_error({
+++         CONSTRUCT_MACRO: '.macro without .endm',
+++         CONSTRUCT_IF:    '.if/.elif without .endif',
+++         CONSTRUCT_ELSE:  '.else without .endif',
+++         CONSTRUCT_REP:   '.rep without .endr'}[construct_stack[-1]]) # NOTE(review): keyed on construct_stack[-1], not construct - confirm they agree
+++
+++   # check no warnings level back to 0
+++   if nwarn_level != 0:
+++      asm_error('.pushnwarn without .popnwarn')
+++
+++   # flush queued up data
+++   asm_flush_prog_data()
+++
+++   # fixup all the label references we can
+++   for pc in xrange(len(prog)):
+++      if isinstance(prog[pc][0], tuple): # low word is still an unresolved (location, label, rel, offset) reference
+++         location, label, rel, offset = prog[pc][0]
+++         if label[0].isdigit(): # numeric label: label[:-1] is the name, label[-1] selects the search direction
+++            label_pcs = labels.get(label[:-1], [])
+++            if label[-1] == 'b':
+++               label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] # last match at or before pc
+++            else:
+++               label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] # first match after pc
+++            if label_pcs == []:
+++               asm_error('search for label reached begin/end of file', location = location)
+++            imm = label_pcs[0]
+++         elif label in labels:
+++            imm = labels[label]
+++         elif (':' + label) in labels:
+++            imm = labels[':' + label]
+++         elif external_link:
+++            continue # let the external linker deal with it
+++         else:
+++            asm_error('undefined label', location = location)
+++         imm = (imm * 8) + offset # instruction index -> byte offset (8 bytes per instruction)
+++         if rel:
+++            imm -= (pc + 4) * 8 # relative to instruction after delay slots
+++            imm &= (1 << 32) - 1 # wrap a negative displacement into 32 bits
+++         else:
+++            if not external_link:
+++               asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
+++            imm = (location, label, rel, offset, imm) # 5-tuple: absolute reference into this program, left for the dumper to relocate
+++         prog[pc] = (imm,) + prog[pc][1:]
+++
+++def asm_init(): # set every piece of assembler global state to its initial value
+++   global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
+++
+++   current_dir = os.getcwd() # base directory for relative filenames until asm_file changes it
+++   current_location = ''
+++   prog = []
+++   prog_data = []
+++   macros = { # built-in macros; presumably name -> (parameter names, [(location, source line)]) - confirm
+++      'sacq': (['dst', 'i'], [('candyland', 'mov  dst, sacq(i)')]),
+++      'srel': (['dst', 'i'], [('candyland', 'mov  dst, srel(i)')])}
+++   labels = {}
+++   construct = None
+++   construct_stack = []
+++   nwarn_level = 0
+++
+++def asm_reset_prog(): # discard the assembled program and its labels; all other state is left alone
+++   global prog, labels
+++
+++   prog = []
+++   labels = {}
+++
+++###############################################################################
+++# dumping
+++###############################################################################
+++
+++def print_lines(lines): # write each item of `lines` to stdout on its own line
+++   for line in lines:
+++      print line
+++
+++class dumper_t: # base output backend: every hook is a no-op, subclasses override what they need
+++   def external_link(self): return False # True lets asm_end_prog leave unresolved/absolute label references in prog
+++   def begin(self): pass # called by dump() before any label/line
+++   def label(self, pc, name): pass # called by dump() for each label, with the pc it is attached to
+++   def line(self, pc, ls, ms, line, annots, first): pass # called by dump() once per prog entry
+++   def end(self): pass # called by dump() after the last label/line
+++   def sets(self, sets): pass # called once by main() after the final dump
+++   def direct(self, line): pass # called by the preprocessors for text outside asm sections
+++
+++class clif_dumper_t(dumper_t): # backend used with preprocess_clif; output depends on the current annot_mode
+++   def __init__(self):
+++      self.annot_mode = 0 # 0: emit instruction words; 1: emit annotation words; 2: emit references into an annotation buffer
+++
+++   def external_link(self):
+++      return True
+++
+++   def parse_annot_mode(self, line): # parses "<mode>[,<buffer name>]"; the name is required for, and only allowed with, mode 2
+++      l = line.split(',')
+++      self.annot_mode = int(l[0])
+++      if self.annot_mode not in (0, 1, 2):
+++         asm_error('bad annot mode')
+++      if self.annot_mode == 2:
+++         if len(l) != 2:
+++            asm_error('expected buffer name')
+++         self.annot_name = l[1].strip()
+++         self.annot_offset = 0 # running byte offset into the named buffer, advanced in line()
+++      elif len(l) != 1:
+++         asm_error('unexpected comma')
+++
+++   def label(self, pc, name):
+++      if (self.annot_mode != 1) and (name[0] == ':'): # only ':'-prefixed labels become @label; none are emitted in mode 1
+++         if self.annot_mode == 2:
+++            name = name + '_annotations'
+++         print '@label %s' % name[1:]
+++      else:
+++         print '// :%s' % name
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if self.annot_mode == 0:
+++         if isinstance(ls, tuple):
+++            if len(ls) == 5: # absolute reference into this program (built in asm_end_prog)
+++               location, label, rel, offset, offset_from_prog = ls
+++               assert not rel
+++               ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
+++            else: # reference to a label that is not defined in this program
+++               location, label, rel, offset = ls
+++               if rel:
+++                  asm_error('relative external label references not allowed in this mode', location = location)
+++               ls = '[%s + %d]' % (label, offset)
+++         else:
+++            ls = '0x%08x' % ls
+++         print '%s 0x%08x // %s' % (ls, ms, line)
+++      elif self.annot_mode == 1:
+++         print '// %s' % line
+++         for annot in annots: # each annot is indexed as (name, value); an unknown name raises KeyError below
+++            print '0x%08x 0x%08x // %s' % ({
+++               # todo: would rather not have these hard coded
+++               'mul_used':              1,
+++               'preserve_cond':         2,
+++               'geomd_open':            3,
+++               'geomd_i':               4,
+++               'geomd_tris_clear':      5,
+++               'geomd_verts':           6,
+++               'geomd_tris_add':        7,
+++               'geomd_tris_set_center': 8,
+++               'geomd_region_clear':    9,
+++               'geomd_region_set':      10,
+++               'geomd_images_clear':    11,
+++               'geomd_images_l':        12,
+++               'geomd_images_b':        13,
+++               'geomd_images_r':        14,
+++               'geomd_images_t':        15,
+++               'geomd_images_add_vpm':  16,
+++               'trace_4c':              17,
+++               'geomd_images_add_tex':  18,}[annot[0]], annot[1], annot[0])
+++         if len(annots) != 0:
+++            print '0x00000000 // end'
+++      else:
+++         assert self.annot_mode == 2
+++         if len(annots) == 0:
+++            print '0x00000000 // %s' % line
+++         else:
+++            print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
+++            self.annot_offset += (len(annots) * 8) + 4 # matches the mode-1 output: two words per annotation plus the end word
+++
+++   def direct(self, line):
+++      print line
+++
+++class plain_dumper_t(dumper_t): # emits one "low, high, // source" row per instruction
+++   def line(self, pc, ls, ms, line, annots, first):
+++      print '0x%08x, 0x%08x, // %s' % (ls, ms, line) # %x needs ls to be a number; external_link() stays False for this backend
+++
+++class c_c_dumper_t(dumper_t): # emits a C source file defining the program as an unsigned int array
+++   def __init__(self, header_name, full_header_name, array_name): # full_header_name is accepted but not used by this class
+++      self.header_name = header_name
+++      self.array_name = array_name
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      self.external_labels = set() # labels already declared with 'extern'
+++      self.lines = [] # array rows are buffered here and printed by end()
+++
+++      print '#include "%s.h"' % self.header_name
+++      print ''
+++      print '#ifdef _MSC_VER'
+++      print '   #include <stdint.h>'
+++      print '   /* cast through uintptr_t to avoid warnings */'
+++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
+++      print '#else'
+++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(X))'
+++      print '#endif'
+++      print ''
+++      print '#ifdef __cplusplus'
+++      print 'extern "C" { /* the types are probably wrong... */'
+++      print '#endif'
+++
+++   def label(self, pc, name):
+++      self.lines.append('// :%s' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple):
+++         if len(ls) == 5: # absolute reference into this program: address of the array plus byte offset
+++            location, label, rel, offset, offset_from_prog = ls
+++            assert not rel
+++            ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
+++         else: # label not defined in this program: left to the C linker
+++            location, label, rel, offset = ls
+++            if rel:
+++               asm_error('relative external label references not allowed in this mode', location = location)
+++            if label not in self.external_labels:
+++               self.external_labels.add(label)
+++               print 'extern uint8_t %s[];' % label # printed immediately, so it lands before the array that end() prints
+++            ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
+++      else:
+++         ls = '0x%08x' % ls
+++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+++
+++   def end(self):
+++      print '#ifdef __cplusplus'
+++      print '}' # closes the extern "C" block opened in begin()
+++      print '#endif'
+++      print ''
+++      print '#ifdef _MSC_VER'
+++      print '__declspec(align(8))'
+++      print '#elif defined(__GNUC__)'
+++      print '__attribute__((aligned(8)))'
+++      print '#endif'
+++      print 'unsigned int %s[] = {' % self.array_name
+++      print_lines(self.lines)
+++      print '};'
+++      print '#ifdef __HIGHC__'
+++      print '#pragma Align_to(8, %s)' % self.array_name
+++      print '#endif'
+++
+++class c_h_dumper_t(dumper_t): # emits the C header declaring the array and one #define per ':' label
+++   def __init__(self, header_name, full_header_name, array_name): # header_name is accepted but not used by this class
+++      self.full_header_name = full_header_name
+++      self.array_name = array_name
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      print '#ifndef %s_H' % self.full_header_name # include guard, closed in end()
+++      print '#define %s_H' % self.full_header_name
+++      print ''
+++      print 'extern unsigned int %s[];' % self.array_name
+++      print ''
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # labels without the ':' prefix are not exported
+++         print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) # two array elements per instruction
+++
+++   def end(self):
+++      print ''
+++      print '#endif'
+++
+++class ml_c_dumper_t(dumper_t): # emits a C file with a const copy of the program and a <name>_link() function that patches it
+++   def __init__(self, header_name, full_header_name, name, annots): # full_header_name is accepted but not used by this class
+++      self.header_name = header_name
+++      self.name = name
+++      self.annots = annots # when true, per-instruction annotation arrays are emitted as well
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      if self.annots:
+++         self.annot_lines = []
+++      self.lines = []
+++      self.external_labels = set()
+++      self.link_lines = [] # C statements that make up the body of <name>_link()
+++
+++      print '#include "%s.h"' % self.header_name
+++      print '#include <assert.h>'
+++      if self.annots:
+++         print '#ifdef SIMPENROSE' # closed by the '#endif' printed in end()
+++         print '#include <stddef.h>'
+++         print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
+++      print ''
+++
+++   def label(self, pc, name):
+++      self.lines.append('// :%s' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if self.annots:
+++         if len(annots) == 0:
+++            self.annot_lines.append('NULL,')
+++         else:
+++            print 'static unsigned int const annotations_%d[] = {' % pc # printed now so it precedes the pointer table from end()
+++            for annot in annots:
+++               print '   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
+++            print '   SIMPENROSE_SHADER_ANNOTATION_END};'
+++            print ''
+++            self.annot_lines.append('annotations_%d,' % pc)
+++      if isinstance(ls, tuple):
+++         self.link_lines.append('   assert(p[%d] == 0xdeadbeef);' % (pc * 2)) # generated code checks the placeholder before patching it
+++         if len(ls) == 5:
+++            location, label, rel, offset, offset_from_prog = ls
+++            assert not rel
+++            self.link_lines.append('   p[%d] = base + %d;' % (pc * 2, offset_from_prog))
+++         else:
+++            location, label, rel, offset = ls
+++            self.external_labels.add(label) # becomes an extra parameter of <name>_link()
+++            if rel:
+++               self.link_lines.append('   p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8)) # same (pc + 4) * 8 origin as asm_end_prog
+++            else:
+++               self.link_lines.append('   p[%d] = %s + %d;' % (pc * 2, label, offset))
+++         ls = '0xdeadbeef' # placeholder stored in the const array
+++      else:
+++         ls = '0x%08x' % ls
+++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+++
+++   def end(self):
+++      if self.annots:
+++         print 'unsigned int const *const %s_annotations_array[] = {' % self.name
+++         print_lines(self.annot_lines)
+++         print '};'
+++         print '#endif'
+++         print ''
+++      print 'static unsigned int const array[] = {'
+++      print_lines(self.lines)
+++      print '};'
+++      print ''
+++      print 'void %s_link(void *p_in, unsigned int base' % self.name
+++      for label in sorted(self.external_labels): # sorted, the same order ml_h_dumper_t uses for the prototype
+++         print '   , unsigned int %s' % label
+++      print '   )'
+++      print '{'
+++      print '   unsigned int *p = (unsigned int *)p_in;'
+++      print '   unsigned int i;'
+++      print '   for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() # <NAME>_SIZE is the macro ml_h_dumper_t defines
+++      print '      p[i] = array[i];'
+++      print '   }'
+++      print_lines(self.link_lines)
+++      print '}'
+++
+++class ml_h_dumper_t(dumper_t): # emits the header for ml_c_dumper_t's output: label offsets, link prototype, size
+++   def __init__(self, header_name, full_header_name, name, annots): # header_name is accepted but not used by this class
+++      self.full_header_name = full_header_name
+++      self.name = name
+++      self.annots = annots
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      self.external_labels = set()
+++      self.lines_n = 0 # number of instructions seen, used for <NAME>_SIZE
+++
+++      print '#ifndef %s_H' % self.full_header_name # include guard, closed in end()
+++      print '#define %s_H' % self.full_header_name
+++      print ''
+++      if self.annots:
+++         print '#ifdef SIMPENROSE'
+++         print '   extern unsigned int const *const %s_annotations_array[];' % self.name
+++         print '#endif'
+++         print ''
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # labels without the ':' prefix are not exported
+++         print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) # byte offset of the label
+++         if self.annots:
+++            print '#ifdef SIMPENROSE'
+++            print '   #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) # one array entry per instruction
+++            print '#endif'
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple) and (len(ls) != 5): # 4-tuples name labels not defined in this program
+++         self.external_labels.add(ls[1])
+++      self.lines_n += 1
+++
+++   def end(self):
+++      print ''
+++      print 'extern void %s_link(void *p, unsigned int base' % self.name
+++      for label in sorted(self.external_labels):
+++         print '   , unsigned int %s' % label
+++      print '   );'
+++      print ''
+++      print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) # size in bytes: 8 per instruction
+++      print ''
+++      print '#endif'
+++
+++def print_lines_lc(lines): # like print_lines, but each line ends with ' \' (line continuation)
+++   for line in lines:
+++      print '%s \\' % line
+++
+++def print_groups_lc(groups): # prints each group as a brace-enclosed, comma-separated, line-continued block
+++   first = True
+++   for group in groups:
+++      if first:
+++         print '{ \\'
+++      else:
+++         print ', { \\' # the separating comma goes in front of every group but the first
+++      print_lines_lc(group)
+++      print '} \\'
+++      first = False
+++
+++class inline_c_dumper_t(dumper_t): # backend used with preprocess_inline_c; emits line-continued C initialiser text
+++   def __init__(self, annots):
+++      self.annots = annots
+++      self.iteration = False # True between begin_iteration() and end_iteration()
+++
+++   def begin_iteration(self): # start collecting several dumps of the same asm section
+++      assert not self.iteration
+++      self.iteration = True
+++      self.iteration_lines = []
+++      if self.annots:
+++         self.iteration_annot_lines = []
+++         self.annot_arrs = []
+++
+++   def end_iteration(self): # print everything collected since begin_iteration()
+++      assert self.iteration
+++      self.iteration = False
+++      print '%d, \\' % self.iteration_n # iteration_n is only assigned in end(), so at least one dump must have run
+++      if self.annots:
+++         print '( \\'
+++      print_groups_lc(self.iteration_lines)
+++      if self.annots:
+++         print '), ( \\'
+++         print_groups_lc(self.iteration_annot_lines)
+++         print '), ( \\'
+++         for annot_arr in self.annot_arrs:
+++            print_lines_lc(annot_arr)
+++         print ') \\'
+++
+++   def begin(self):
+++      self.n = 0 # instruction count of the current dump
+++      self.lines = []
+++      if self.annots:
+++         self.annot_lines = []
+++         if not self.iteration: # while iterating, annot_arrs accumulates across dumps
+++            self.annot_arrs = []
+++
+++   def label(self, pc, name):
+++      self.lines.append('/* :%s */' % name)
+++      if self.annots:
+++         self.annot_lines.append('/* :%s */' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      self.n += 1
+++      if first:
+++         prefix = ''
+++      else:
+++         prefix = ', ' # leading comma on every entry but the first
+++      self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) # %x needs ls to be a number; external_link() stays False here
+++      if self.annots:
+++         if len(annots) == 0:
+++            a = 'NULL'
+++         else:
+++            a = 'annotations_%d' % len(self.annot_arrs) # arrays are numbered in order of creation
+++            annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
+++            for annot in annots:
+++               annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
+++            annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_END};')
+++            self.annot_arrs.append(annot_arr)
+++         self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
+++
+++   def end(self):
+++      if self.iteration: # buffer this dump; end_iteration() prints it
+++         if len(self.iteration_lines) == 0:
+++            self.iteration_n = self.n # the first dump fixes the expected instruction count
+++         elif self.iteration_n != self.n:
+++            asm_error('number of instructions differs between iterations')
+++         self.iteration_lines.append(self.lines)
+++         if self.annots:
+++            self.iteration_annot_lines.append(self.annot_lines)
+++      else:
+++         if self.annots:
+++            print '( \\'
+++         print_lines_lc(self.lines)
+++         if self.annots:
+++            print '), ( \\'
+++            print_lines_lc(self.annot_lines)
+++            print '), ( \\'
+++            for annot_arr in self.annot_arrs:
+++               print_lines_lc(annot_arr)
+++            print ') \\'
+++
+++   def direct(self, line):
+++      print line
+++
+++class asvc_dumper_t(dumper_t): # emits assembler source: labels plus one '.word low, high' per instruction
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      print '.align 8'
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # ':' labels are written with '::', all others with a single ':'
+++         print '%s::' % name[1:]
+++      else:
+++         print '%s:' % name
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple):
+++         location, label, rel, offset = ls[:4] # works for both the 4-tuple and the 5-tuple form
+++         if rel:
+++            ls = '%s + %d - (. + 32)' % (label, offset) # 32 = 4 instructions * 8 bytes, the same origin asm_end_prog uses
+++         else:
+++            ls = '%s + %d' % (label, offset)
+++      else:
+++         ls = '0x%08x' % ls
+++      print '.word %s, 0x%08x ; %s' % (ls, ms, line)
+++
+++def is_ra_or_rb(val): # True only for a loc_t whose mux is MUX_A or MUX_B
+++   return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
+++
+++class aliases_dumper_t(dumper_t): # emits quoted name/value pairs for labels and sets, plus a DQASM_ARGS define
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      print '#ifndef JUST_DQASM_ARGS' # closed by end()
+++
+++   def label(self, pc, name):
+++      if not name[0].isdigit(): # labels starting with a digit are skipped
+++         if name[0] == ':':
+++            name = name[1:]
+++         print '"bs%s", "bs%x",' % (name, pc * 8) # value is the label's byte offset in hex
+++         print '"bu%s", "bu%x",' % (name, pc * 8)
+++
+++   def end(self):
+++      print '#endif'
+++
+++   # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
+++   def sets(self, sets):
+++      dqasm_args = []
+++      print '#ifndef JUST_DQASM_ARGS'
+++      for name in sets:
+++         if is_ra_or_rb(sets[name]):
+++            dqasm_args.append('-r%s=%s' % (sets[name], name))
+++            print '"%s", "%s",' % (name, sets[name])
+++         elif isinstance(sets[name], list): # lists are expanded element by element as name[i]
+++            for i, val in enumerate(sets[name]):
+++               if is_ra_or_rb(val):
+++                  dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
+++                  print '"%s[%d]", "%s",' % (name, i, val)
+++      print '#endif'
+++      print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) # printed outside the JUST_DQASM_ARGS guard
+++
+++def dump(dumper): # feed the assembled program and its labels to `dumper`, in pc order
+++   if (len(prog) != 0) or (len(labels) != 0): # nothing at all is emitted (not even begin/end) when both are empty
+++      dumper.begin()
+++
+++      sorted_labels = []
+++      for name in labels:
+++         if name[0].isdigit(): # numeric labels map to a list of pcs, other labels to a single pc
+++            for pc in labels[name]:
+++               sorted_labels.append((pc, name))
+++         else:
+++            sorted_labels.append((labels[name], name))
+++      sorted_labels.sort(reverse = True) # descending, so pop() from the end yields ascending pc
+++
+++      first = True
+++      for pc in xrange(len(prog)):
+++         ls, ms, line, annots = prog[pc]
+++         while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
+++            dumper.label(*sorted_labels.pop()) # labels are emitted before the instruction they point at
+++         dumper.line(pc, ls, ms, line, annots, first)
+++         first = False
+++      for sorted_label in sorted_labels: # whatever is left must point just past the last instruction
+++         assert sorted_label[0] == len(prog)
+++         dumper.label(*sorted_label)
+++
+++      dumper.end()
+++
+++###############################################################################
+++# preprocessing
+++###############################################################################
+++
+++def preprocess_inline_c(dumper): # builds a preprocessor that assembles the asm between '%[' and '%]' and passes other text to dumper.direct
+++   def preprocess(file):
+++      ls = None # None outside an asm section, else the list of (line_number, text) collected so far
+++      line_number = 0
+++      for line in file:
+++         line_number += 1
+++         while True: # one input line may open and close several sections
+++            if ls is None:
+++               l = line.split('%[', 1)
+++               if len(l) == 1:
+++                  dumper.direct(l[0].rstrip())
+++                  break
+++               dumper.direct('%s \\' % l[0].rstrip()) # text before '%[' is emitted with a line continuation
+++               line = l[1]
+++               ls = []
+++            else:
+++               l = line.split('%]', 1)
+++               ls.append((line_number, l[0]))
+++               if len(l) == 1: # no '%]' on this line: keep collecting
+++                  break
+++               line = l[1]
+++               l = ls[-1][1].split('%|', 1) # '%|' on the closing line introduces the replacement lists
+++               if len(l) == 1:
+++                  for l_number, l in ls:
+++                     yield l_number, l
+++                  asm_end_prog()
+++                  dump(dumper)
+++                  asm_reset_prog()
+++               else:
+++                  ls[-1] = (ls[-1][0], l[0])
+++                  if hasattr(dumper, 'begin_iteration'):
+++                     dumper.begin_iteration()
+++                  for repls in l[1].split('%,'): # '%,' separates iterations, '%/' separates the values within one
+++                     repls = [repl.strip() for repl in repls.split('%/')]
+++                     for l_number, l in ls:
+++                        for i, repl in enumerate(repls):
+++                           l = l.replace('%' + str(i), repl) # NOTE(review): '%1' also matches inside '%10' - confirm >10 values never occur
+++                        yield l_number, l
+++                     asm_end_prog()
+++                     dump(dumper)
+++                     asm_reset_prog()
+++                  if hasattr(dumper, 'end_iteration'):
+++                     dumper.end_iteration()
+++               ls = None
+++   return preprocess
+++
+++def preprocess_clif(dumper): # builds a preprocessor that assembles '%'-marked lines and passes other text to dumper.direct
+++   def preprocess(file):
+++      in_asm = False # True between a '%[' line and the matching '%]' line
+++      line_number = 0
+++      for line in file:
+++         line_number += 1
+++         if in_asm:
+++            if line.strip() == '%]':
+++               asm_end_prog()
+++               dump(dumper)
+++               asm_reset_prog()
+++               in_asm = False
+++            else:
+++               yield line_number, line
+++         else:
+++            if line.strip() == '%[':
+++               in_asm = True
+++            elif (line[:1] == '%') and (line[:2] != '%@'): # single asm line: assemble everything after the '%'
+++               yield line_number, line[1:]
+++            else: # any other line ends the program assembled so far, which is dumped first
+++               asm_end_prog()
+++               dump(dumper)
+++               asm_reset_prog()
+++               if line[:2] == '%@': # '%@' lines carry the annotation mode and are not echoed
+++                  if hasattr(dumper, 'parse_annot_mode'):
+++                     dumper.parse_annot_mode(line[2:])
+++               else:
+++                  dumper.direct(line.rstrip())
+++   return preprocess
+++
+++###############################################################################
+++# main
+++###############################################################################
+++
+++def main(): # command line entry point: parse options, pick a dumper, assemble the input and dump it
+++   global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+++   global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+++
+++   asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+++
+++   # parse command line
+++   parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+++   parser.add_option('-m', '--mode', dest = 'mode',
+++      help = '<mode> should be clif, plain, ' +
+++      'c_c:<header_name>,<full_header_name>,<array_name>, ' +
+++      'c_h:<header_name>,<full_header_name>,<array_name>, ' +
+++      'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
+++      'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
+++      'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
+++   parser.add_option('-t', '--target', dest = 'target',
+++      help = '<target> should be a0, b0, or hera', metavar = '<target>')
+++   parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+++   parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+++   parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+++   parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+++   parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+++   options, args = parser.parse_args()
+++   if len(args) == 0:
+++      filename = None # asm_file reads stdin when filename is None
+++   elif len(args) == 1:
+++      filename = args[0]
+++   else:
+++      parser.print_help()
+++      sys.exit(-1)
+++
+++   # handle mode
+++   mode = options.mode or 'clif' # assume clif if no mode specified
+++   if mode == 'clif':
+++      dumper = clif_dumper_t()
+++      preprocess = preprocess_clif(dumper)
+++   elif mode == 'plain':
+++      dumper = plain_dumper_t()
+++      preprocess = None # None makes asm_file fall back to preprocess_passthrough
+++   elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+++      mode_options = mode[4:].split(',')
+++      if len(mode_options) != 3:
+++         asm_error('badly formatted mode on command line')
+++      dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+++      preprocess = None
+++   elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+++      mode_options = mode[5:].split(',')
+++      if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
+++         asm_error('badly formatted mode on command line')
+++      dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+++         }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4])) # fourth constructor argument is the annots flag
+++      preprocess = None
+++   elif mode == 'inline_c':
+++      dumper = inline_c_dumper_t(False)
+++      preprocess = preprocess_inline_c(dumper)
+++   elif mode == 'inline_c:annots':
+++      dumper = inline_c_dumper_t(True)
+++      preprocess = preprocess_inline_c(dumper)
+++   elif mode == 'asvc':
+++      dumper = asvc_dumper_t()
+++      preprocess = None
+++   elif mode == 'aliases':
+++      dumper = aliases_dumper_t()
+++      preprocess = None
+++   elif mode == 'aliases:inline_c':
+++      dumper = aliases_dumper_t()
+++      preprocess = preprocess_inline_c(dumper)
+++   else:
+++      asm_error('invalid mode')
+++   external_link = dumper.external_link() # read by asm_end_prog when it meets labels it cannot resolve
+++
+++   # handle target
+++   target = options.target or 'b0' # assume b0 if no target specified
+++   if target == 'a0':
+++      have_sema = False
+++      have_am = False
+++      mulw_rotate = False
+++      have_lthrsw = False # not named in the global statements above, so this one is local to main
+++   elif target == 'b0':
+++      have_sema = True
+++      have_am = True
+++      mulw_rotate = True
+++      have_lthrsw = True
+++   elif target == 'hera':
+++      have_sema = True
+++      have_am = False
+++      mulw_rotate = True
+++      have_lthrsw = True
+++   else:
+++      asm_error('invalid target')
+++   if have_am:
+++      sigs['loadam'] = SIG_LOADAM
+++      arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+++   if have_lthrsw: # adds the 'lthrsw' signal, removes 'int', and adds an 'interrupt' argument definition
+++      sigs['lthrsw'] = SIG_LTHRSW
+++      del sigs['int']
+++      arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+++
+++   # handle misc options
+++   allow_xor_0 = options.allow_xor_0
+++   dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+++   warnings_are_errors = options.warnings_are_errors
+++   disable_warnings = options.disable_warnings
+++
+++   # make options visible to asm
+++   arg_defs['mode'] = mode
+++   arg_defs['target'] = target
+++
+++   # arg_defs all setup at this point
+++   sets = arg_defs.copy() # todo: see arg_eval
+++
+++   # handle command line sets
+++   re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+++   for options_set in options.sets:
+++      m = re_options_set.match(options_set)
+++      if not m:
+++         asm_error('badly formatted set on command line')
+++      sets[m.group('name')] = arg_eval(m.group('val'), sets) # evaluated in order, so later sets can see earlier ones
+++
+++   # assemble input file and dump
+++   asm_file(sets, filename, filename, preprocess) # the filename doubles as the location used in messages
+++   asm_end_prog()
+++   dump(dumper)
+++   for name in arg_defs: # todo: see arg_eval
+++      del sets[name] # so only names that are not in arg_defs reach dumper.sets
+++   dumper.sets(sets)
+++
+++if __name__ == '__main__':
+++   main()
++diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
++new file mode 100755
++index 0000000..6a9a33f
++--- /dev/null
+++++ b/pi-util/rebase_liblinks.py
++@@ -0,0 +1,37 @@
+++#!/usr/bin/env python
+++
+++import os, sys
+++from stat import *
+++
+++def walktree(top, callback, n, prefix):
+++    '''recursively descend the directory tree rooted at top,
+++       calling the callback function for each symbolic link'''
+++
+++    for f in os.listdir(top):
+++        pathname = os.path.join(top, f)
+++        mode = os.lstat(pathname).st_mode # lstat: inspect the link itself, not what it points to
+++        if S_ISDIR(mode):
+++            # It's a directory, recurse into it
+++            walktree(pathname, callback, n+1, prefix) # n counts directory levels below the original top
+++        elif S_ISLNK(mode):
+++            # It's a symlink, call the callback function with its target
+++            callback(pathname, os.readlink(pathname), n, prefix)
+++
+++def visitfile(file, linkname, n, prefix): # walktree callback: rewrites a link into <prefix>lib/ as a relative link
+++    if (linkname.startswith(prefix + 'lib/')): # links pointing anywhere else are left untouched
+++        newlink = "../" * n + linkname[len(prefix):] # climb n levels, then follow the target path with prefix stripped
+++        print 'relinking', file, "->", newlink
+++        os.remove(file) # os.symlink does not overwrite, so the old link is removed first
+++        os.symlink(newlink, file)
+++
+++if __name__ == '__main__': # usage: rebase_liblinks.py <local root> [<old sysroot>]
+++    argc = len(sys.argv)
+++    if argc == 2:
+++        walktree(sys.argv[1], visitfile, 0, "/") # default old sysroot prefix is "/"
+++    elif argc == 3:
+++        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
+++    else:
+++        print "rebase_liblinks.py <local root> [<old sysroot>]"
+++        sys.exit(1) # usage error: report failure so callers running under 'set -e' stop
+++
+++
++diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
++new file mode 100755
++index 0000000..d8bdd91
++--- /dev/null
+++++ b/pi-util/syncroot.sh
++@@ -0,0 +1,43 @@
+++set -e # abort on the first failing command
+++
+++if [ "$1" = "" ]; then # '=' is the portable test operator; '==' is rejected by some non-bash sh implementations
+++  echo "Usage: $0 <src_dir> [<rootname>]"
+++  echo src_dir is a source for rsync so may contain m/c name.
+++  echo rootname will be set to \"raspian_jessie_pi1\" if missing
+++  echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
+++  exit 1
+++fi
+++
+++SYSROOT_NAME=$2
+++if [ "$SYSROOT_NAME" = "" ]; then
+++  SYSROOT_NAME=raspian_jessie_pi1
+++fi
+++
+++DST_ROOT=`pwd` # run from the tree root: pi-util/rebase_liblinks.py below is invoked relative to it
+++DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
+++SRC=$1
+++
+++echo Sync src:  $SRC
+++echo Sync dest: $DST
+++
+++mkdir -p "$DST/lib" # paths are quoted so a working directory containing spaces is handled
+++mkdir -p "$DST/opt/vc/include"
+++mkdir -p "$DST/usr/lib/pkgconfig"
+++mkdir -p "$DST/usr/bin"
+++mkdir -p "$DST/usr/share"
+++
+++#### MUST NOT include /opt/vc/include/*GL*
+++# Creates conflicts with GL includes inside Chrome
+++
+++rsync -rl "$SRC/lib/arm-linux-gnueabihf" "$DST/lib"
+++rsync -rl "$SRC/opt/vc/lib" "$DST/opt/vc"
+++rsync -l  "$SRC/opt/vc/include/bcm_host.h" "$DST/opt/vc/include"
+++rsync -rl "$SRC/opt/vc/include/interface" "$DST/opt/vc/include"
+++rsync -rl "$SRC/opt/vc/include/vcinclude" "$DST/opt/vc/include"
+++rsync -rl "$SRC/usr/lib/arm-linux-gnueabihf" "$DST/usr/lib"
+++rsync -rl "$SRC/usr/lib/gcc" "$DST/usr/lib"
+++rsync -rl "$SRC/usr/include" "$DST/usr"
+++
+++pi-util/rebase_liblinks.py "$DST" # make the copied absolute lib symlinks relative to the new sysroot
+++
+++
diff --git a/projects/RPi2/patches/kodi/kodi-001-backport.patch b/projects/RPi2/patches/kodi/kodi-001-backport.patch
index 355c3494c9..42910f0a13 100644
--- a/projects/RPi2/patches/kodi/kodi-001-backport.patch
+++ b/projects/RPi2/patches/kodi/kodi-001-backport.patch
@@ -1,91 +1,7 @@
-From 3a032772cf28a21dcfcd12f8872e211b391fac64 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Oct 2014 00:19:40 +0000
-Subject: [PATCH 01/64] [cec] Add settings for configuring button repeats
-
----
- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
- system/peripherals.xml                              |  4 +++-
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
- 3 files changed, 34 insertions(+), 1 deletion(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index da5580360222805f83da510d7eba0b67a4c67c84..6e0d5ed0fbba1aee3cca9bff3401b366cb77c2b7 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19728,3 +19728,18 @@ msgstr ""
- msgctxt "#39010"
- msgid "Select sort method"
- msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38050"
-+msgid "Remote button press delay before repeating (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38051"
-+msgid "Remote button press repeat rate (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38052"
-+msgid "Remote button press release time (ms)"
-+msgstr ""
-diff --git a/system/peripherals.xml b/system/peripherals.xml
-index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df0621f9e041 100644
---- a/system/peripherals.xml
-+++ b/system/peripherals.xml
-@@ -31,7 +31,9 @@
-     <setting key="device_type" type="int" value="1" configurable="0" />
-     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
-     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
--    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
-+    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
-+    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
-+    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
-   </peripheral>
- 
-   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
-   m_configuration.bActivateSource = config.bActivateSource;
-   bChanged |= SetSetting("activate_source", m_configuration.bActivateSource == 1);
- 
-+#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
-+  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
-+#else
-+  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs);
-+#endif
-+
-+  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
-+  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
-+
-+  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
-+  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
-+
-   m_configuration.bPowerOffOnStandby = config.bPowerOffOnStandby;
- 
-   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
-@@ -1398,6 +1412,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
-   // backwards compatibility. will be removed once the next major release of libCEC is out
-   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
- #endif
-+  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
-+  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
- 
-   if (GetSettingBool("pause_playback_on_deactivate"))
-   {
-
-From 84fde1194d89b02d321ff4049a572bce88947ec9 Mon Sep 17 00:00:00 2001
+From fb711d36229c80705b4a0a36ce2e120c3e1466fd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Apr 2014 17:27:52 +0100
-Subject: [PATCH 02/64] [cec] Don't suspend pi on tv switch off - it can't wake
+Subject: [PATCH 02/67] [cec] Don't suspend pi on tv switch off - it can't wake
  up
 
 ---
@@ -106,10 +22,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325
      <setting key="use_tv_menu_language" type="bool" value="1" label="36018" order="10" />
      <setting key="pause_playback_on_deactivate" type="bool" value="1" label="36033" configurable="0" />
 
-From 46438fbd67528d9ac3ca8bba386a9f7e1e18c777 Mon Sep 17 00:00:00 2001
+From 6b34039dc27c952fc5217ffc1f0e1fac49992bed Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 03/64] [rbp/omxplayer] When opening a stream don't try to
+Subject: [PATCH 03/67] [rbp/omxplayer] When opening a stream don't try to
  update gui so often
 
 ---
@@ -133,10 +49,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f
          dialog->ProcessRenderLoop(false);
          if (allowCancel && dialog->IsCanceled())
 
-From 1b2fcbc6357fa1399576a819398c01833053b35e Mon Sep 17 00:00:00 2001
+From 4757e370bc4f5fc42f7191b893c2d806d6c76bbc Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 04/64] [hifiberry] Hack: force it to be recognised as IEC958
+Subject: [PATCH 04/67] [hifiberry] Hack: force it to be recognised as IEC958
  capable to enable passthrough options
 
 ---
@@ -159,10 +75,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5
          info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
      {
 
-From ea630fb4c3a67d3fb21b927dd18eaa5ba8937fbb Mon Sep 17 00:00:00 2001
+From eec779f1dba335e11b9b30955f047fa432896b2f Mon Sep 17 00:00:00 2001
 From: Ben Avison <bavison@riscosopen.org>
 Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 05/64] Improved file buffering in CArchive
+Subject: [PATCH 05/67] Improved file buffering in CArchive
 
 Even though memcpy is typically inlined by the compiler into byte/word loads
 and stores (at least for release builds), the frequency with which 1, 2 and 4
@@ -222,10 +138,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a
      }
  
 
-From b56b00d9100980eaee66810fac1ab0124292ba75 Mon Sep 17 00:00:00 2001
+From 2ed4fa5cf7935b1e04d2e2aebd0f214047ed358f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 06/64] filesystem: Make support of browsing into archives
+Subject: [PATCH 06/67] filesystem: Make support of browsing into archives
  optional
 
 The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
@@ -244,10 +160,10 @@ We'll let people who don't use archives disable it manually
  4 files changed, 26 insertions(+), 2 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6e0d5ed0fbba1aee3cca9bff3401b366cb77c2b7..6cc42fe19769b23fa71d6bc9ae6776cad01d9e19 100644
+index 6443f3dd885bf0aa8e031039e36e273972a310ae..7dfc5355cc0d85d94360ba21bc738733e4878f3d 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19371,6 +19371,15 @@ msgstr ""
+@@ -19388,6 +19388,15 @@ msgstr ""
  #: system/settings/rbp.xml
  msgctxt "#38010"
  msgid "GPU accelerated"
@@ -335,10 +251,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7
    {
      CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
 
-From a5cb6b253a6c9e6a1b7f4cf8aaf16e804f679856 Mon Sep 17 00:00:00 2001
+From dcebf738cde43680261a85a3385c728189b84cdb Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 07/64] [rbp] Make cachemembuffersize default depend on memory
+Subject: [PATCH 07/67] [rbp] Make cachemembuffersize default depend on memory
  size
 
 ---
@@ -402,7 +318,7 @@ index a35a509a91483f13e2cf0e688fc7e9528f254290..fffa5182126159f6dfcf750b21fa0464
    void Deinitialize();
    int GetArmMem() { return m_arm_mem; }
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index cc37998f0c9edfb38cf609666374cfa96530bf8f..3891a7ed34acb3489a860678d56a8ec049890f6e 100644
+index 1c00edab33101b82a5817ac03c7f1d98007e1856..12ba1aca0ba838bd8d33e9ca1043845c10f90954 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -50,6 +50,9 @@
@@ -440,10 +356,10 @@ index cc37998f0c9edfb38cf609666374cfa96530bf8f..3891a7ed34acb3489a860678d56a8ec0
  }
  
 
-From 2fb7a0e59386ce93c8f4e7685880bd292d179b29 Mon Sep 17 00:00:00 2001
+From b3cfcae349f63dc41713cb9cd24707f02b0184d6 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 08/64] [settings] Experiment: Report DESKTOP resolution in
+Subject: [PATCH 08/67] [settings] Experiment: Report DESKTOP resolution in
  video settings
 
 ---
@@ -465,10 +381,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9
          StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
                              ModeFlagsToString(resolution->flags, false).c_str()),
 
-From d38a1b9896fe67dd2504144f49138393a335eaad Mon Sep 17 00:00:00 2001
+From 4a486ce217ef15870a9c23d6e0cbd2c69137100a Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 09/64] [audio] Add settings option to boost centre channel
+Subject: [PATCH 09/67] [audio] Add settings option to boost centre channel
  when downmixing
 
 This allows a dB volume increase to be added to centre channel.
@@ -486,10 +402,10 @@ Should work with Pi Sink (dvdplayer/paplayer) and omxplayer
  5 files changed, 46 insertions(+)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6cc42fe19769b23fa71d6bc9ae6776cad01d9e19..7b171b3186d47726d1f60cc0225358dc434e9d9f 100644
+index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390ada04bca8 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19591,6 +19591,21 @@ msgstr ""
+@@ -19608,6 +19608,21 @@ msgstr ""
  
  #empty strings from id 38062 to 38099
  
@@ -512,7 +428,7 @@ index 6cc42fe19769b23fa71d6bc9ae6776cad01d9e19..7b171b3186d47726d1f60cc0225358dc
  #: system/settings/settings.xml
  msgctxt "#38100"
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index f28f5daa8145613670b93fdb221dc53eadf5ce63..316b641d01ceaa8e0a347d8331b56b41c6a44b49 100644
+index 301e7276e5b79e00457db1f33b1cd576bdef4c85..5f1f3ca48342ef1a4eeed7432221d7b2dda354e8 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -2358,6 +2358,18 @@
@@ -594,10 +510,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5
      // stereo upmix
      if (upmix && m_src_channels == 2 && m_dst_channels > 2)
 
-From 3d536f5d2226193b5066d97f727795efce625d48 Mon Sep 17 00:00:00 2001
+From d3125a94e433da8ead850dfa45ed1d6ded3f3148 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 10/64] [rbp] Default extract thumbnails to false
+Subject: [PATCH 10/67] [rbp] Default extract thumbnails to false
 
 It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
 It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
@@ -623,10 +539,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc
      </category>
    </section>
 
-From 6adf157187e26042b135d082bc1d91637e4108c4 Mon Sep 17 00:00:00 2001
+From 2e4c872b8c0b795156f96918cb8a2d4e099b1d1e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 11/64] [languageinvoker] Reduce priority of python threads
+Subject: [PATCH 11/67] [languageinvoker] Reduce priority of python threads
 
 ---
  xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
@@ -649,10 +565,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1
  }
  
 
-From 7edf27fd315fa0aa2683790e8d16c7674253f86f Mon Sep 17 00:00:00 2001
+From 97ce5209853a7f18a79e5d98893353a3bb52f3dd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 12/64] [rbp] hack: wait for splash to complete before changing
+Subject: [PATCH 12/67] [rbp] hack: wait for splash to complete before changing
  hdmi mode
 
 ---
@@ -736,10 +652,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928
  
    RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
 
-From cdc528d79b99a1cc90d1828f56a46234fd685c9a Mon Sep 17 00:00:00 2001
+From 70f0ff25bc73321491cc1ad85e3fbb5514dfdc16 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 13/64] Fix for UI not showing both extractflags and
+Subject: [PATCH 13/67] Fix for UI not showing both extractflags and
  extractthumb
 
 ---
@@ -748,7 +664,7 @@ Subject: [PATCH 13/64] Fix for UI not showing both extractflags and
  2 files changed, 9 insertions(+), 5 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569e832641a 100644
+index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
 @@ -12451,7 +12451,7 @@ msgstr ""
@@ -760,7 +676,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #: xbmc/dialogs/GUIDialogSmartPlaylistRule.cpp
-@@ -17011,7 +17011,7 @@ msgstr ""
+@@ -17028,7 +17028,7 @@ msgstr ""
  #. Description of setting with label #20433 "Extract thumbnails and video information"
  #: system/settings/settings.xml
  msgctxt "#36178"
@@ -769,7 +685,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #. Description of setting with label #20419 "Replace file names with library titles"
-@@ -17023,7 +17023,7 @@ msgstr ""
+@@ -17040,7 +17040,7 @@ msgstr ""
  #. Description of setting with label #20433 "Extract thumbnails and video information"
  #: system/settings/settings.xml
  msgctxt "#36180"
@@ -778,7 +694,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
  msgstr ""
  
  #: system/settings/settings.xml
-@@ -19767,3 +19767,7 @@ msgstr ""
+@@ -19784,3 +19784,7 @@ msgstr ""
  msgctxt "#38052"
  msgid "Remote button press release time (ms)"
  msgstr ""
@@ -787,7 +703,7 @@ index 7b171b3186d47726d1f60cc0225358dc434e9d9f..7c619f5619ea974eda22315179a20569
 +msgid "Extract thumbnails from video files"
 +msgstr ""
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 316b641d01ceaa8e0a347d8331b56b41c6a44b49..85d3b93466236c33940d01a10d0b8761d1eaa2f6 100644
+index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b29942315a 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -974,8 +974,8 @@
@@ -802,10 +718,10 @@ index 316b641d01ceaa8e0a347d8331b56b41c6a44b49..85d3b93466236c33940d01a10d0b8761
            <control type="toggle" />
          </setting>
 
-From 3201c31912acbedfd5f035d4bd65df0fbb73d0e3 Mon Sep 17 00:00:00 2001
+From a1f119e0986ee89641e533cbafae576147e5848d Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 14/64] Disable autoscrolling while on screensaver and while
+Subject: [PATCH 14/67] Disable autoscrolling while on screensaver and while
  opening streams.
 
 ---
@@ -818,10 +734,10 @@ Subject: [PATCH 14/64] Disable autoscrolling while on screensaver and while
  6 files changed, 24 insertions(+), 3 deletions(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index c6ef0c0e08493090b02accd5cbcbbcb7d8530d87..426835c1b34477ef4871c8720879ed5f89e40386 100644
+index b8ff91b427c4fd430675aab3d1d93098c976031f..fdf7b1dc04e31ffe8e1d1b83825343b24c645b02 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -5226,3 +5226,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
+@@ -5229,3 +5229,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
    
    return false;
  }
@@ -836,7 +752,7 @@ index c6ef0c0e08493090b02accd5cbcbbcb7d8530d87..426835c1b34477ef4871c8720879ed5f
 +  return onBlackDimScreenSaver || openingStreams;
 +}
 diff --git a/xbmc/Application.h b/xbmc/Application.h
-index 8d5876e03d7180ca71ed7c06108c1fa7c81ebe64..2d7f2616159406efdd0d8df4384f41ac9a144f5d 100644
+index 2c21d92c942dfcb3e29f26f00cb545f4b16dca0d..d5446b200439833fed02f998d180ce001eb98067 100644
 --- a/xbmc/Application.h
 +++ b/xbmc/Application.h
 @@ -393,6 +393,8 @@ public:
@@ -936,10 +852,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144
        if (m_lastRenderTime)
          m_autoScrollDelayTime += currentTime - m_lastRenderTime;
 
-From 66f8b01e3d210df4f9cfadbdf188a82651804cc3 Mon Sep 17 00:00:00 2001
+From 65ee3a30d6489b53126e6d34b01ed8c29a4920e5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 15/64] [demuxer] Avoid memcpy on every demuxer packet
+Subject: [PATCH 15/67] [demuxer] Avoid memcpy on every demuxer packet
 
 Avoids an unnecessary memcpy on every demuxer packet which for
 high bitrate videos can be significant.
@@ -1039,10 +955,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96
      }
      catch(...) {
 
-From 9d108d1ba19f61e3b60260eaf0b65c7e607e9f55 Mon Sep 17 00:00:00 2001
+From 66365771b22ae63d65bbb6df6f8d77d5a5dab33e Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 16/64] Load OSD dialogs on startup.
+Subject: [PATCH 16/67] Load OSD dialogs on startup.
 
 Fixes skipped frames the first time they're loaded in memory on less powered
 devices, like a Raspberry Pi, when using DVDPlayer.
@@ -1137,10 +1053,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e
  CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
  { }
 
-From 973728dd114827a31c754ab1f128a88707f80a49 Mon Sep 17 00:00:00 2001
+From 2c71f9b477cfd5ecb5cdedb6688502dc8cef8fa8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 17/64] [gui] Also limit GUI updates when in non full-screen
+Subject: [PATCH 17/67] [gui] Also limit GUI updates when in non full-screen
  video mode
 
 ---
@@ -1148,10 +1064,10 @@ Subject: [PATCH 17/64] [gui] Also limit GUI updates when in non full-screen
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d46ea490d 100644
+index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61752519ef 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -2768,7 +2768,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
+@@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
  #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU)
      // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode
      // it makes only sense on architectures with multiple layers
@@ -1160,7 +1076,7 @@ index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d
        fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE);
  #endif
  
-@@ -2781,6 +2781,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
+@@ -2784,6 +2784,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
      {
        if (!m_skipGuiRender)
          g_windowManager.Process(CTimeUtils::GetFrameTime());
@@ -1170,10 +1086,10 @@ index 426835c1b34477ef4871c8720879ed5f89e40386..0d2a8b4b4c70d9b194de35a9369b0c2d
      g_windowManager.FrameMove();
    }
 
-From aeab20d2a11f70c12577db4cda2d307fb65ddeb8 Mon Sep 17 00:00:00 2001
+From e0ee1d794615a8f4378801b55fe4c36e12d7aab5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 18/64] [screensaver] Leave GUI contents available for
+Subject: [PATCH 18/67] [screensaver] Leave GUI contents available for
  screensaver
 
 ---
@@ -1203,10 +1119,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac
  
    // Add window to the history list (we must do this before we activate it,
 
-From fb83e3c356d0c1f70c8fd09170d8ea868c5c4bdd Mon Sep 17 00:00:00 2001
+From fb4c838814069e7178ecfde96cecf43ba76cf722 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 19/64] ffmpeg: Automatic switch to software decode for GMC
+Subject: [PATCH 19/67] ffmpeg: Automatic switch to software decode for GMC
  with more than one warp point
 
 ---
@@ -1434,10 +1350,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4
        else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
          supported = true;
 
-From 75b4698f0a751244d729c2b8d09a489c2e37d365 Mon Sep 17 00:00:00 2001
+From acde728909548c939ae05ff179e461fdffda3e1b Mon Sep 17 00:00:00 2001
 From: Claudio-Sjo <Claudio.Porfiri@gmail.com>
 Date: Mon, 16 Feb 2015 14:51:26 +0100
-Subject: [PATCH 20/64] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
+Subject: [PATCH 20/67] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
  - fixes #15794
 
 ---
@@ -1629,10 +1545,10 @@ index 0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3
    lsn_t m_lsnCurrent; // Position inside the track in logical sector number
    lsn_t m_lsnEnd;   // End of m_iTrack in logical sector number
 
-From 4282fb935f1a3b2deb8d16e3e9cbc73ed327e451 Mon Sep 17 00:00:00 2001
+From 79220cf49a616e2d1f18a6872323dc02521d4440 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 24 Jun 2016 19:38:13 +0100
-Subject: [PATCH 21/64] codecoverlay: Include codec name in overlay
+Subject: [PATCH 21/67] codecoverlay: Include codec name in overlay
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++
@@ -1643,10 +1559,10 @@ Subject: [PATCH 21/64] codecoverlay: Include codec name in overlay
  5 files changed, 17 insertions(+), 5 deletions(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-index f822935ab7fc919128db53f70a6c4eb84d9759bc..9db3a9cc91fd5f9b194d6c1aa66aa02121164c29 100644
+index ec5f91443f99f57a5e250ddc89a0d04278c00c63..1823f2b02a076e0ab33ca2776fefddb2e126c3d1 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-@@ -210,6 +210,10 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
+@@ -208,6 +208,10 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
    std::ostringstream s;
    s << "aq:"     << std::setw(2) << std::min(99,m_messageQueue.GetLevel()) << "%";
    s << ", Kb/s:" << std::fixed << std::setprecision(2) << (double)GetAudioBitrate() / 1024.0;
@@ -1677,10 +1593,10 @@ index 89db27cce079e3e273050f2fa71f941f21b8280b..903f0d83527d9088ff1bf0ba056f357f
    s << ", skip:" << m_renderManager.GetSkippedFrames();
  
 diff --git a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-index 1e5d2b98bbef15b47994c3e4735873a9946b58c7..d43350fa0eefb5960475a02c1327efc24d138e0f 100644
+index 3fa9e11bf58cc1d59773beb1fbeb6fe614535a6c..3006d5445eb1de27e6a5b9a82f564bcde24f3557 100644
 --- a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
 +++ b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-@@ -659,6 +659,10 @@ std::string OMXPlayerAudio::GetPlayerInfo()
+@@ -641,6 +641,10 @@ std::string OMXPlayerAudio::GetPlayerInfo()
    std::ostringstream s;
    s << "aq:"     << std::setw(2) << std::min(99,m_messageQueue.GetLevel() + MathUtils::round_int(100.0/8.0*GetCacheTime())) << "%";
    s << ", Kb/s:" << std::fixed << std::setprecision(2) << (double)GetAudioBitrate() / 1024.0;
@@ -1726,10 +1642,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291
    double                    m_iSubtitleDelay;
    bool                      m_bRenderSubs;
 
-From 9d1a5913a50d500595acbe929eede41eb43d1a01 Mon Sep 17 00:00:00 2001
+From 5813a683f670077d064bba5fe2592c105b4f73b4 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 8 Mar 2016 21:20:58 +0300
-Subject: [PATCH 22/64] [DebugInfo] Add cpu usage info.
+Subject: [PATCH 22/67] [DebugInfo] Add cpu usage info.
 
 ---
  .../VideoPlayer/VideoRenderers/DebugRenderer.cpp   | 56 ++++++++--------------
@@ -1899,10 +1815,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef
  
        m_debugTimer.Set(1000);
 
-From ffe63455a082ff98b5b6916e47116027eda65ed2 Mon Sep 17 00:00:00 2001
+From 37063ea86134466d1a61a6b6e1cf51638cb7088b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 23/64] ffmpeg: Allow neon to be enabled in unified builds
+Subject: [PATCH 23/67] ffmpeg: Allow neon to be enabled in unified builds
 
 ---
  tools/depends/target/ffmpeg/Makefile | 4 ++++
@@ -1925,10 +1841,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153
  ifeq ($(OS), linux)
    ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
 
-From 322fa05ebba56f61ee7a7ed46da04301ca2814d0 Mon Sep 17 00:00:00 2001
+From ba52537598de76e187946e0869fcadda7c7d48be Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 24/64] ffmpeg: Add some upstream HEVC optimisations
+Subject: [PATCH 24/67] ffmpeg: Add some upstream HEVC optimisations
 
 ---
  tools/depends/target/ffmpeg/Makefile               |    6 +-
@@ -5726,10 +5642,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81
 +2.5.0
 +
 
-From 2182021a32ca71f9c71366a1df71af134f6ecbbc Mon Sep 17 00:00:00 2001
+From b7d0926bb1263518cc9c6fda4945b1b24ba0bf63 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 25/64] [ffmpeg] Add GPU acceleration to hevc
+Subject: [PATCH 25/67] [ffmpeg] Add GPU acceleration to hevc
 
 ---
  tools/depends/target/ffmpeg/Makefile               |     4 +-
@@ -43915,10 +43831,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71
 +2.7.4
 +
 
-From e81f7f20e79e8dcedd41aa6cf1ec32ae4a9862c0 Mon Sep 17 00:00:00 2001
+From 1c5e663d7d0ad95e5fc3de874b006531a50d9b47 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 26/64] ffmpeg: Add cabac opimisations for hevc
+Subject: [PATCH 26/67] ffmpeg: Add cabac opimisations for hevc
 
 ---
  .../0001-Squashed-commit-of-the-following.patch    | 2179 ++++++++++++++++++++
@@ -46163,10 +46079,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b
  
  make -j ${BUILDTHREADS} 
 
-From 9ada0f3d707391c0f9c684ef5bf3deab52926fbc Mon Sep 17 00:00:00 2001
+From cea464637de727807464b87e3efa161268b891ad Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 27/64] [3d] Make MVC a valid 3D filename tag
+Subject: [PATCH 27/67] [3d] Make MVC a valid 3D filename tag
 
 ---
  xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
@@ -46195,7 +46111,7 @@ index b34873cba6534086ae243326550385867a03256a..1443acaf0f25df458ae49766e13dd032
  }
  
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 3891a7ed34acb3489a860678d56a8ec049890f6e..974305ff329eb6999c908d5e05d723f93137ae33 100644
+index 12ba1aca0ba838bd8d33e9ca1043845c10f90954..3478719e18e9430224542c3ed825cd036e975434 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -402,6 +402,7 @@ void CAdvancedSettings::Initialize()
@@ -46227,10 +46143,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef
      bool m_useDisplayControlHWStereo;
  
 
-From 714732f56e37ffa93ed95405799cadf96b550282 Mon Sep 17 00:00:00 2001
+From bb14b6e47f39bf9e9c659175e397d4b54057b904 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 5 Oct 2015 14:58:05 +0100
-Subject: [PATCH 28/64] [3d] Swap top/bottom sides of GUI
+Subject: [PATCH 28/67] [3d] Swap top/bottom sides of GUI
 
 ---
  xbmc/guilib/GraphicContext.cpp | 2 +-
@@ -46250,10 +46166,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d
    }
    if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
 
-From a9695b01e43e3c9f2148f7bc138f45b07dca89f0 Mon Sep 17 00:00:00 2001
+From 6ea3e583f9bf50d3e4d9fba65442d965db2e7c60 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 11 Oct 2015 20:51:37 +0100
-Subject: [PATCH 29/64] Revert "Revert "Disable extra logging by default""
+Subject: [PATCH 29/67] Revert "Revert "Disable extra logging by default""
 
 This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
 ---
@@ -46261,10 +46177,10 @@ This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 85d3b93466236c33940d01a10d0b8761d1eaa2f6..8b7e9698510c611909d56caa5902391627a084b8 100644
+index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2fdbe11b2 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
-@@ -2822,12 +2822,12 @@
+@@ -2834,12 +2834,12 @@
          </setting>
          <setting id="debug.extralogging" type="boolean" label="666" help="36394">
            <level>1</level>
@@ -46280,10 +46196,10 @@ index 85d3b93466236c33940d01a10d0b8761d1eaa2f6..8b7e9698510c611909d56caa59023916
              <options>loggingcomponents</options>
              <delimiter>,</delimiter>
 
-From a1b74504cf236599e0a4cd1a7e70472d98b8e5fe Mon Sep 17 00:00:00 2001
+From e4a199961a0f9eef3de3b6d9b2f43746aae44e2a Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 30/64] [omximage] Fall back to arm jpeg encode/decode when gpu
+Subject: [PATCH 30/67] [omximage] Fall back to arm jpeg encode/decode when gpu
  is busy
 
 ---
@@ -46526,10 +46442,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218
  
  extern COMXImage g_OMXImage;
 
-From 2d5f106241a3360cc6dafc9bfa8815bfb1811073 Mon Sep 17 00:00:00 2001
+From 4d10ca2fe22af671bc3ee041242aa19fcc6d986d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 31/64] [mmalcodec] Fail to open when width is invalid. Can
+Subject: [PATCH 31/67] [mmalcodec] Fail to open when width is invalid. Can
  happen with mpegts files
 
 ---
@@ -46551,10 +46467,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568
    if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
      return false;
 
-From 779210ec56e1c9910f7ed75f3d5ca793468f1dfe Mon Sep 17 00:00:00 2001
+From 1c412e8a9559575ce37830782c994eeaf608aace Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 32/64] [videoplayer/rbp] Add pi specific option to maintain
+Subject: [PATCH 32/67] [videoplayer/rbp] Add pi specific option to maintain
  vsync with pll adjustment
 
 New A/V sync option in settings/video/playback to do "Adjust PLL".
@@ -46576,10 +46492,10 @@ or drop/dupe audio packets which is normally required.
  12 files changed, 143 insertions(+), 21 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 7c619f5619ea974eda22315179a20569e832641a..6a637a80c9e5d900e23cfd87ee6ce5375d2065d6 100644
+index b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7..55ec0a9985a8e77873d787e879d73c076e13b2c6 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -19771,3 +19771,35 @@ msgstr ""
+@@ -19788,3 +19788,35 @@ msgstr ""
  msgctxt "#38190"
  msgid "Extract thumbnails from video files"
  msgstr ""
@@ -46881,10 +46797,10 @@ index 81882a1a3828e3f95df26c1bd88c061d3b994b44..ed6974b1155a7272f3ef5bfed3f74967
    void Drain();
    void AbortAddPackets();
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df58a23abf 100644
+index 1823f2b02a076e0ab33ca2776fefddb2e126c3d1..af38453a0e9b212634ee8a4b99c336fff0a71efc 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp
-@@ -96,6 +96,7 @@ bool CVideoPlayerAudio::OpenStream(CDVDStreamInfo &hints)
+@@ -95,6 +95,7 @@ bool CVideoPlayerAudio::OpenStream(CDVDStreamInfo &hints)
    bool allowpassthrough = !CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK);
    if (hints.realtime)
      allowpassthrough = false;
@@ -46892,7 +46808,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
    CDVDAudioCodec* codec = CDVDFactoryCodec::CreateAudioCodec(hints, m_processInfo, allowpassthrough, m_processInfo.AllowDTSHDDecode());
    if(!codec)
    {
-@@ -217,8 +218,12 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
+@@ -215,8 +216,12 @@ void CVideoPlayerAudio::UpdatePlayerInfo()
  
    //print the inverse of the resample ratio, since that makes more sense
    //if the resample ratio is 0.5, then we're playing twice as fast
@@ -46905,7 +46821,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
  
    s << ", att:" << std::fixed << std::setprecision(1) << log(GetCurrentAttenuation()) * 20.0f << " dB";
  
-@@ -541,10 +546,12 @@ void CVideoPlayerAudio::SetSyncType(bool passthrough)
+@@ -525,10 +530,12 @@ void CVideoPlayerAudio::SetSyncType(bool passthrough)
      int synctype = (m_synctype >= 0 && m_synctype <= 1) ? m_synctype : 2;
      CLog::Log(LOGDEBUG, "CVideoPlayerAudio:: synctype set to %i: %s", m_synctype, synctypes[synctype]);
      m_prevsynctype = m_synctype;
@@ -46920,7 +46836,7 @@ index 9db3a9cc91fd5f9b194d6c1aa66aa02121164c29..56170f48cda417554c57b2adf934c2df
    }
  }
  
-@@ -602,6 +609,7 @@ bool CVideoPlayerAudio::SwitchCodecIfNeeded()
+@@ -586,6 +593,7 @@ bool CVideoPlayerAudio::SwitchCodecIfNeeded()
    bool allowpassthrough = !CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK);
    if (m_streaminfo.realtime)
      allowpassthrough = false;
@@ -46995,10 +46911,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50
  
  extern CRBP g_RBP;
 
-From f7ed055669b0f9a04bf7ddcaeb579506006dd45e Mon Sep 17 00:00:00 2001
+From 8fcc6e14f70281abfe8f29d4eecb09d3a1981750 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 33/64] rbp: Support zero copy interface with hevc acceleration
+Subject: [PATCH 33/67] rbp: Support zero copy interface with hevc acceleration
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++
@@ -47042,10 +46958,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803
    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
      CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem);
 
-From 92cc9e7d80c9bf2eb519eb683b0627554aae9bcf Mon Sep 17 00:00:00 2001
+From d13334643b6bdf8df9c7711fc2942498092f6a88 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 16 May 2015 18:26:04 +0100
-Subject: [PATCH 34/64] ffmpeg: use upstream mvc patches
+Subject: [PATCH 34/67] ffmpeg: use upstream mvc patches
 
 ---
  ...vcodec-add-h264_mvc-codec-id-and-profiles.patch |  68 ++++++++++++
@@ -47355,10 +47271,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b
 +2.7.4
 +
 
-From 6a3b03ea91643c4be1edd7c5bb4308fe193e6f8e Mon Sep 17 00:00:00 2001
+From a31ff8c43a33afc551e852ac68dcce470c418c67 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 29 Jan 2016 17:18:50 +0300
-Subject: [PATCH 35/64] [win32] Settings: Added setting to enable/disable MVC
+Subject: [PATCH 35/67] [win32] Settings: Added setting to enable/disable MVC
  decoder.
 
 ---
@@ -47388,10 +47304,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9
      <category id="display">
        <group id="1">
 
-From d0c53b95139c9170448cbfaebc6cb8da3adfe62c Mon Sep 17 00:00:00 2001
+From 8accdc748ba93a0ff8406a12da77b3456887eea9 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Wed, 20 Jan 2016 17:02:16 +0300
-Subject: [PATCH 36/64] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
+Subject: [PATCH 36/67] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
  streams.
 
 ---
@@ -47454,10 +47370,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81
        }
      case AVMEDIA_TYPE_DATA:
 
-From 546338b3e9e48142fa89cb54ae5864f4e0d2f7d0 Mon Sep 17 00:00:00 2001
+From f24fa3672ebcb63848be6e7b3669f7034616287c Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 25 Feb 2016 11:21:25 +0300
-Subject: [PATCH 37/64] [Stereo3D] Added block_lr and block_rl to supported
+Subject: [PATCH 37/67] [Stereo3D] Added block_lr and block_rl to supported
  modes.
 
 ---
@@ -47507,10 +47423,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814
      i++;
    }
 
-From 0ac0cad635d5ba36264d7674242e95342969f4f6 Mon Sep 17 00:00:00 2001
+From b61e26e406192689178f2059d91ce73d10cf0a5c Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Sat, 23 Jan 2016 10:21:32 +0300
-Subject: [PATCH 38/64] [VideoPlayer] Fix possible wrong aspect.
+Subject: [PATCH 38/67] [VideoPlayer] Fix possible wrong aspect.
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +-
@@ -47530,10 +47446,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd
    else
      m_fForcedAspectRatio = 0.0;
 
-From 49681796854013d3028d92d6cee16bb361c31b0d Mon Sep 17 00:00:00 2001
+From 88e75af3d50f9e4c2912317f3f3713372a472fec Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 22 Jan 2016 18:18:33 +0300
-Subject: [PATCH 39/64] [VideoPlayer] DemuxFFmpeg: ssif remux
+Subject: [PATCH 39/67] [VideoPlayer] DemuxFFmpeg: ssif remux
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt  |   2 +
@@ -47954,7 +47870,7 @@ index e4f8aed0af96fe0dceec4d8517087742f2c7df81..30076937bd084936571abf0e6eeecf5a
  LIB = DVDDemuxers.a
  
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 974305ff329eb6999c908d5e05d723f93137ae33..985ecf9722141d78471c00e90da15bfad931462a 100644
+index 3478719e18e9430224542c3ed825cd036e975434..748354c94045ca279801464930e98bd57963de96 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -391,7 +391,7 @@ void CAdvancedSettings::Initialize()
@@ -47967,10 +47883,10 @@ index 974305ff329eb6999c908d5e05d723f93137ae33..985ecf9722141d78471c00e90da15bfa
    m_discStubExtensions = ".disc";
    // internal music extensions
 
-From cfc857d82be72d89051610c0f72432487153c99c Mon Sep 17 00:00:00 2001
+From 91fa36649e96a95270e2296449aaa7aa92a25713 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:02:46 +0300
-Subject: [PATCH 40/64] [3DBD] Added support of 3D-BluRay playback.
+Subject: [PATCH 40/67] [3DBD] Added support of 3D-BluRay playback.
 
 ---
  lib/DllLibbluray.h                                 |   8 +
@@ -48960,10 +48876,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 901ebdbc1d85ea4522d75ca17d57b5ef4ac3cdd4 Mon Sep 17 00:00:00 2001
+From 7c59c8fe5dfd19c92cd01adba7203588fc9a05cf Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 11 Mar 2016 16:58:53 +0300
-Subject: [PATCH 41/64] [VideoPlayer] HasVideo returns true if video stream
+Subject: [PATCH 41/67] [VideoPlayer] HasVideo returns true if video stream
  exists. This don't allow start visualization if audio is opened before video.
 
 ---
@@ -48971,10 +48887,10 @@ Subject: [PATCH 41/64] [VideoPlayer] HasVideo returns true if video stream
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index bd11cd8a76fce261e5b7e0129d3b9181f0ef84c6..b8db03c873df41c2a3daa52867d30f8b14965821 100644
+index defe4e44a4cca76527186abb989dcb847e1431cd..b45b7573636de0cecd86606d942d5e3baf214c91 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-@@ -3117,7 +3117,7 @@ void CVideoPlayer::Pause()
+@@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause()
  
  bool CVideoPlayer::HasVideo() const
  {
@@ -48984,10 +48900,10 @@ index bd11cd8a76fce261e5b7e0129d3b9181f0ef84c6..b8db03c873df41c2a3daa52867d30f8b
  
  bool CVideoPlayer::HasAudio() const
 
-From 6b11efe3833212a6bdbe714ae3ac7a2e114a6791 Mon Sep 17 00:00:00 2001
+From 21b67480972e81a6892a02477468fc4d33d786c4 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 42/64] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 42/67] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -49024,10 +48940,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45
      CDVDOverlayImage* overlay = new CDVDOverlayImage();
  
 
-From 8e6cc386c73cc66084f0bdb7a45e2ee6756db6af Mon Sep 17 00:00:00 2001
+From 7fe6612e145e2868b058726f22dfcd8292092aa5 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 7 Apr 2016 17:28:50 +0300
-Subject: [PATCH 43/64] [VideoPlayer] Disable reading extension stream from
+Subject: [PATCH 43/67] [VideoPlayer] Disable reading extension stream from
  input stream if decoder doesn't support it.
 
 ---
@@ -49257,10 +49173,10 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa
  
  class CDVDAudioCodec;
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index b8db03c873df41c2a3daa52867d30f8b14965821..2ee5cbf243a91763ca89747c6c23f3b71875437c 100644
+index b45b7573636de0cecd86606d942d5e3baf214c91..ccc371d723b386eb76f022f794c7563d70e1dadd 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-@@ -3894,6 +3894,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
+@@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
      if (!player->OpenStream(hint))
        return false;
  
@@ -49284,10 +49200,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1
    // classes
    CDVDOverlayContainer* m_pOverlayContainer;
 
-From a0c8e7ce6f37a8cd9fb0e2ce474c0239f8b2280f Mon Sep 17 00:00:00 2001
+From 29f7bc99e15245a3303f2b70c46b6c3f4d15fd84 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 16 Sep 2016 11:37:48 +0300
-Subject: [PATCH 44/64] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
+Subject: [PATCH 44/67] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
  platform settings to common settings.
 
 ---
@@ -49317,7 +49233,7 @@ index 2572e25753712186f69390965ee1448bff3fadd5..7098edf32dff8c00e192229c3ffb060b
    </section>
    <section id="media">
 diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 8b7e9698510c611909d56caa5902391627a084b8..e73d85c18ea63453275f2a8f2a0cdd96c4b11e39 100644
+index 850abcd174cc8773319639c7e337f2e2fdbe11b2..0fb9464a598cad05893bff627cbd7ddee7341ca8 100644
 --- a/system/settings/settings.xml
 +++ b/system/settings/settings.xml
 @@ -343,6 +343,12 @@
@@ -49369,10 +49285,10 @@ index 74e8e1fc2da66d3c98a5bab04faa2f6bf16539ff..7dd85f0173bd636f4f5ae6e7fc43b306
      MPLS_PL * mpls = m_dll->bd_get_title_mpls(m_bd);
      if (mpls)
 diff --git a/xbmc/settings/SettingConditions.cpp b/xbmc/settings/SettingConditions.cpp
-index 6b1f2b6d757354d6065c2862b44dfb47184a1dcc..9163ec85bd0feb48a698a025d9870bf40042c675 100644
+index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540cc3b8f88 100644
 --- a/xbmc/settings/SettingConditions.cpp
 +++ b/xbmc/settings/SettingConditions.cpp
-@@ -327,6 +327,9 @@ void CSettingConditions::Initialize()
+@@ -339,6 +339,9 @@ void CSettingConditions::Initialize()
    m_simpleConditions.insert("has_dx");
    m_simpleConditions.insert("hasdxva2");
  #endif
@@ -49383,10 +49299,10 @@ index 6b1f2b6d757354d6065c2862b44dfb47184a1dcc..9163ec85bd0feb48a698a025d9870bf4
    m_simpleConditions.insert("have_lcms2");
  #endif
 
-From 618138892406de177ea6ffeb1c8170013f38dc57 Mon Sep 17 00:00:00 2001
+From 6cafd78f07821d41e50d95015e7531b47ff4ba22 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 4 Nov 2016 22:56:56 +0300
-Subject: [PATCH 45/64] [VideoPlayer] SSIF: fix for corner case when mvc stream
+Subject: [PATCH 45/67] [VideoPlayer] SSIF: fix for corner case when mvc stream
  is switched before the last packet is read from previous stream.
 
 ---
@@ -49575,33 +49491,33 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 9414392ca337af025f99cb5ff2388cd18fab05e0 Mon Sep 17 00:00:00 2001
+From a94a694d85b3f7bb7ebd528cbff1c22500dcd033 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:01:08 +0300
-Subject: [PATCH 46/64] [libbluray] bump libbluray to 0.9.2-mvc.
+Subject: [PATCH 46/67] [libbluray] bump libbluray to 0.9.2-mvc.
 
 ---
  project/BuildDependencies/scripts/0_package.list | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list
-index 67151c1a1bf47df2b81d38f80ddc3f5e1a3b4eab..9f7ff84b06acca2a5c24f6a74b82d46c544a8b07 100644
+index 3ca0ecb8b91f4db2a0dae1f0fac217bd18c7bc43..48f6088640142b2d40d9a4bce525baa87d3278a3 100644
 --- a/project/BuildDependencies/scripts/0_package.list
 +++ b/project/BuildDependencies/scripts/0_package.list
-@@ -17,7 +17,7 @@ freetype-2.6.3-win32-vc140.7z
+@@ -17,7 +17,7 @@ freetype-dc2b38-win32-vc140-v2.7z
  giflib-5.1.4-win32-vc140.7z
  jsonschemabuilder-1.0.0-win32-3.7z
- libass-542975a-win32-vc140.7z
+ libass-6aaaf5-win32-vc140.7z
 -libbluray-0.9.3-win32-vc140.7z
 +libbluray-0.9.2-mvc-win32-vc120.7z
  libcdio-0.9.3-win32-vc140.7z
  libcec-4.0.1-win32-vc140-2.7z
  libfribidi-0.19.2-win32.7z
 
-From e707f353eb3a974559e4a139610ae9a23ff48f4d Mon Sep 17 00:00:00 2001
+From 4b4f63b3b9083dc559680a33095ffd21ec6c8d81 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 29 Feb 2016 17:00:50 +0000
-Subject: [PATCH 47/64] libbluray: Bump to Nevcairie's v0.9.2
+Subject: [PATCH 47/67] libbluray: Bump to Nevcairie's v0.9.2
 
 This includes 3D support
 ---
@@ -51258,10 +51174,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220
 + 
 +     return fp;
 
-From c1e2e8a5832154824fb4ed0b43d214bbef8fcbde Mon Sep 17 00:00:00 2001
+From 05f619cf0bde4209267f1de6270e7fc93718b0d7 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 6 Mar 2016 12:54:59 +0000
-Subject: [PATCH 48/64] mvc: Automatically enable stereo mode
+Subject: [PATCH 48/67] mvc: Automatically enable stereo mode
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++-
@@ -51319,10 +51235,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd
      break;
      case AV_CODEC_ID_MPEG4:
 
-From 965d2478fb479e9b1624c8cba5a2d4a53a6187e4 Mon Sep 17 00:00:00 2001
+From 0792d059c421811de7a52b55638373c26c21e008 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 24 Mar 2016 13:02:58 +0000
-Subject: [PATCH 49/64] ffmpeg: mvc: fix for pixelation from packets with no
+Subject: [PATCH 49/67] ffmpeg: mvc: fix for pixelation from packets with no
  pts/dts
 
 ---
@@ -51384,10 +51300,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba
  	cd $(PLATFORM);\
  	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
 
-From 8092e05383265a3d5f9b86d63de81fba443dd719 Mon Sep 17 00:00:00 2001
+From 7cca3aef831e26c2e43a92646a0243b58486981d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 11 Nov 2016 15:53:53 +0000
-Subject: [PATCH 50/64] stereoscopicmanager: fixups for rbp
+Subject: [PATCH 50/67] stereoscopicmanager: fixups for rbp
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++
@@ -51625,10 +51541,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782
  };
  
 
-From 2df4c943373762d16be809dd00f188af2b6dc631 Mon Sep 17 00:00:00 2001
+From 780c207fcd4e086d32f74d6a097642ba86b8b8a3 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 51/64] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 51/67] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -51648,10 +51564,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18
    }
  
 
-From 6502330a06981420a0e94795c9470ea0eced81e3 Mon Sep 17 00:00:00 2001
+From be1121b5490ba594b4338223d0587c4ce12b6030 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Nov 2016 18:24:18 +0000
-Subject: [PATCH 52/64] DemuxMVC: fixup after SeekTime API change
+Subject: [PATCH 52/67] DemuxMVC: fixup after SeekTime API change
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +-
@@ -51685,36 +51601,10 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425
    virtual int GetStreamLength() { return 0; };
    virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; };
 
-From 71bd79c370a44ab2f0c01faca4fbe0d2557bc553 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 3 Nov 2014 23:17:46 +0000
-Subject: [PATCH 53/64] [cec] Don't discard buttons when repeat mode is enabled
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
- 
-   CSingleLock lock(m_critSection);
--  if (key.iDuration > 0)
-+  // avoid the queue getting too long
-+  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+    return;
-+  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-     {
-
-From bcd2d7a264e2822f9dfd0283254a4a1cf233b7c6 Mon Sep 17 00:00:00 2001
+From 76a51144a06e9b8147407ccb4d6fe926e8a7816b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 54/64] [cec] Temp - more logging
+Subject: [PATCH 54/67] [cec] Temp - more logging
 
 ---
  xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
@@ -51766,10 +51656,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a
  }
  
 
-From 938124103d0041c0d275629d691fc6d39739b840 Mon Sep 17 00:00:00 2001
+From 205a27d0a81ab333c53a786a00dffae7e1d6205f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 25 May 2016 18:31:17 +0100
-Subject: [PATCH 55/64] rbp: Hard code the number of buffers to improve audio
+Subject: [PATCH 55/67] rbp: Hard code the number of buffers to improve audio
  sync
 
 ---
@@ -51811,10 +51701,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d
  
  }
 
-From 9688862943ab68f8aca7c8d58bf75172bdc7128e Mon Sep 17 00:00:00 2001
+From ac68b8f32842e24009e2de53e42fa6775af263a8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 4 Jul 2016 18:30:03 +0100
-Subject: [PATCH 56/64] rbp: Update the GL libs to new naming scheme
+Subject: [PATCH 56/67] rbp: Update the GL libs to new naming scheme
 
 As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues.
 As such we are renaming the firmware GL driver to avoid this.
@@ -51828,7 +51718,7 @@ will be dropped at some point
  3 files changed, 5 insertions(+), 5 deletions(-)
 
 diff --git a/configure.ac b/configure.ac
-index 060939073c841360dd69bfd9c3a50bd15b6a9411..065af598a8e06b80a779ece30d1d09440b1293bf 100644
+index a37c8c5d2d92194731203b19a5cf8a369e96d3fa..772facc7c837e730317b8708800741efc608a9c9 100644
 --- a/configure.ac
 +++ b/configure.ac
 @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then
@@ -51879,10 +51769,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71
  fi
  
 
-From 34a539610c33155e45768df7e5d2d9c9ee6258d7 Mon Sep 17 00:00:00 2001
+From 9838684c2520a11221010731a75d1ee556216205 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 28 Jun 2016 14:46:01 +0100
-Subject: [PATCH 57/64] ffmpeg: hacky fix for files with GMC
+Subject: [PATCH 57/67] ffmpeg: hacky fix for files with GMC
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++--
@@ -51904,10 +51794,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675
          {
            if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
 
-From 0f4661f6302b5c8025ba24d55522faa6513f2dab Mon Sep 17 00:00:00 2001
+From d31579fe437b86fb71198d5f077645cfaae0f017 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 19 Jul 2016 20:39:18 +0100
-Subject: [PATCH 58/64] mmalrender: Add sharpness control
+Subject: [PATCH 58/67] mmalrender: Add sharpness control
 
 ---
  addons/resource.language.en_gb/resources/strings.po         |  2 +-
@@ -51916,7 +51806,7 @@ Subject: [PATCH 58/64] mmalrender: Add sharpness control
  3 files changed, 14 insertions(+), 2 deletions(-)
 
 diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 6a637a80c9e5d900e23cfd87ee6ce5375d2065d6..4ae21a644a4739448e7752c95970fc61b1d3ebd3 100644
+index 55ec0a9985a8e77873d787e879d73c076e13b2c6..eea89feb0f698619623ec67ed0078d30d18c22fc 100644
 --- a/addons/resource.language.en_gb/resources/strings.po
 +++ b/addons/resource.language.en_gb/resources/strings.po
 @@ -8694,7 +8694,7 @@ msgstr ""
@@ -51979,10 +51869,10 @@ index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd
    CCriticalSection m_sharedSection;
    MMAL_COMPONENT_T *m_vout;
 
-From 41a329e62bc59dcb395c060c02551e5bba7c5691 Mon Sep 17 00:00:00 2001
+From bf759c6dad91e60580981c29146bf495a18167cf Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 14 Oct 2016 15:37:53 +0100
-Subject: [PATCH 59/64] MMALFFMpeg: Report as SW decode in codec overlay info
+Subject: [PATCH 59/67] MMALFFMpeg: Report as SW decode in codec overlay info
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +-
@@ -52002,10 +51892,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0
    CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str());
  }
 
-From a92c013a38a1b086d74c6267c587484297128626 Mon Sep 17 00:00:00 2001
+From 8c3b6d8a7d39d535cc73cd5d0479fb2a01fd1171 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Nov 2016 18:28:01 +0000
-Subject: [PATCH 60/64] advancedsettings: Add option to set cache size on
+Subject: [PATCH 60/67] advancedsettings: Add option to set cache size on
  libass
 
 E.g to set total cache size in libass to 32M
@@ -52071,7 +51961,7 @@ index f9de4f15e7c612d69ef46e7cad870ecb61afaec3..b5303fd100f1a930eb5c010a95193206
    END_METHOD_RESOLVE()
  };
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 985ecf9722141d78471c00e90da15bfad931462a..a33581ba02a26110105a2d0ae810d96c410efbf1 100644
+index 748354c94045ca279801464930e98bd57963de96..5bdd6244e28c8320e18fed5148d332da19801221 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -364,6 +364,8 @@ void CAdvancedSettings::Initialize()
@@ -52107,10 +51997,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97
      unsigned int m_jsonTcpPort;
  
 
-From 07d4465716adca7dcad36fa18c74fa26f7afabe5 Mon Sep 17 00:00:00 2001
+From 4ce0c7911dba73b04ae32cb813b9a5fca1d44998 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 13 Nov 2016 20:30:15 +0000
-Subject: [PATCH 61/64] [rbp] Experimental limit libass cache size depending on
+Subject: [PATCH 61/67] [rbp] Experimental limit libass cache size depending on
  arm memory size
 
 ---
@@ -52141,7 +52031,7 @@ index 6e8529001b1a464b4547a846f553d98f5bc0b6c0..238eba372af2cbab11d7543c857ee476
    response[sizeof(response) - 1] = '\0';
    CLog::Log(LOGNOTICE, "Config:\n%s", response);
 diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index a33581ba02a26110105a2d0ae810d96c410efbf1..d70e2cf3113bbe0dad60dfc7accc8d77f7f30c30 100644
+index 5bdd6244e28c8320e18fed5148d332da19801221..95a9d347049dbfa04d74248dce9167a6896566dc 100644
 --- a/xbmc/settings/AdvancedSettings.cpp
 +++ b/xbmc/settings/AdvancedSettings.cpp
 @@ -361,8 +361,10 @@ void CAdvancedSettings::Initialize()
@@ -52156,33 +52046,10 @@ index a33581ba02a26110105a2d0ae810d96c410efbf1..d70e2cf3113bbe0dad60dfc7accc8d77
    m_libAssCache = 0;
  
 
-From d82023dcbc5281e42f9d3795d94c8a6d2da8ddae Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 17 Jan 2017 21:05:26 +0000
-Subject: [PATCH 62/64] ADSP: Hack - disable
-
----
- xbmc/ServiceManager.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/ServiceManager.cpp b/xbmc/ServiceManager.cpp
-index ecaa4037e53fbefcbbd6f7e6b75d1cb781a82cc0..27e50d337c702371817582c61b22892c43c3683a 100644
---- a/xbmc/ServiceManager.cpp
-+++ b/xbmc/ServiceManager.cpp
-@@ -70,7 +70,7 @@ bool CServiceManager::Init2()
- 
- bool CServiceManager::Init3()
- {
--  m_ADSPManager->Init();
-+  //m_ADSPManager->Init();
-   m_PVRManager->Init();
-   m_contextMenuManager->Init();
- 
-
-From b7ba577164b7f01786e2424e34fee7747e373f70 Mon Sep 17 00:00:00 2001
+From 8f293b32905920446d7d84c031a01ef451e8fa4c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 63/64] [rbp] Use default resampling setting on Pi2
+Subject: [PATCH 62/67] [rbp] Use default resampling setting on Pi2
 
 ---
  system/settings/rbp2.xml | 5 +++++
@@ -52205,10 +52072,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
    </section>
  </settings>
 
-From 5e72bc627bb8417a4b5fe19628370ddbd9dcad89 Mon Sep 17 00:00:00 2001
+From d96d809cdb68c8299b0b6771a11eb390060df5b8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 1 Dec 2016 17:06:01 +0000
-Subject: [PATCH 64/64] MMALRender: Allow advanced deinterlace with software
+Subject: [PATCH 63/67] MMALRender: Allow advanced deinterlace with software
  decode
 
 Uses YUV420 directly which improves performance.
@@ -52230,3 +52097,53500 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e
  
      status = mmal_port_format_commit(m_deint_output);
      if (status != MMAL_SUCCESS)
+
+From 38b097a6d6653267c52de2a2d5f6a09c39642684 Mon Sep 17 00:00:00 2001
+From: Nuno Senica <nsenica@gmail.com>
+Date: Tue, 27 Dec 2016 20:59:56 +0000
+Subject: [PATCH 64/67] Apply ffmpeg patches automatically after downloading
+ and extracting the ffmpeg tar ball
+
+---
+ project/cmake/modules/FindFFMPEG.cmake | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake
+index 3d7fcc8ca30224fc589c720e37102588f3739448..486842e9bd6c6e7c65ca0dd9c2a6fc6f26169a5d 100644
+--- a/project/cmake/modules/FindFFMPEG.cmake
++++ b/project/cmake/modules/FindFFMPEG.cmake
+@@ -260,7 +260,17 @@ if(NOT FFMPEG_FOUND)
+                                     <SOURCE_DIR> &&
+                                     ${CMAKE_COMMAND} -E copy
+                                     ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake
+-                                    <SOURCE_DIR>)
++                                    <SOURCE_DIR> &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
++                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
++                     )
+ 
+   file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper
+ "#!/bin/bash
+
+From 41d5ae2774e4c5ed6386180c1553f129a1be549c Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Sun, 1 May 2016 19:56:43 +0100
+Subject: [PATCH 65/67] omxplayer: Avoid CAEFactory::Suspend which should only
+ be called by application
+
+---
+ xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp | 12 +++++++++---
+ xbmc/cores/omxplayer/OMXAudio.cpp         | 15 +++++++++++----
+ xbmc/cores/omxplayer/OMXAudio.h           |  2 ++
+ 3 files changed, 22 insertions(+), 7 deletions(-)
+
+diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
+index 750ea754924d00dbaae9f479485d03f4b3011028..adb74a8bffedc118d4734f59162d0fb9598cc139 100644
+--- a/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
++++ b/xbmc/cores/AudioEngine/Sinks/AESinkPi.cpp
+@@ -214,8 +214,6 @@ bool CAESinkPi::Initialize(AEAudioFormat &format, std::string &device)
+   format.m_sampleRate    = std::max(8000U, std::min(192000U, format.m_sampleRate));
+   format.m_frames        = format.m_sampleRate * AUDIO_PLAYBUFFER / NUM_OMX_BUFFERS;
+ 
+-  SetAudioProps(m_passthrough, GetChannelMap(format.m_channelLayout, m_passthrough));
+-
+   m_format = format;
+   m_sinkbuffer_sec_per_byte = 1.0 / (double)(m_format.m_frameSize * m_format.m_sampleRate);
+ 
+@@ -223,6 +221,12 @@ bool CAESinkPi::Initialize(AEAudioFormat &format, std::string &device)
+                 m_format.m_dataFormat, channels, m_format.m_sampleRate, m_format.m_frameSize, m_format.m_frameSize * m_format.m_frames, 1.0/m_sinkbuffer_sec_per_byte,
+                 CSettings::GetInstance().GetString(CSettings::SETTING_AUDIOOUTPUT_AUDIODEVICE).c_str());
+ 
++  // magic value used when omxplayer is playing - want sink to be disabled
++  if (m_passthrough && m_format.m_streamInfo.m_sampleRate == 16000)
++    return true;
++
++  SetAudioProps(m_passthrough, GetChannelMap(m_format.m_channelLayout, m_passthrough));
++
+   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
+ 
+   if (!m_omx_render.Initialize("OMX.broadcom.audio_render", OMX_IndexParamAudioInit))
+@@ -432,8 +436,10 @@ double CAESinkPi::GetCacheTotal()
+ unsigned int CAESinkPi::AddPackets(uint8_t **data, unsigned int frames, unsigned int offset)
+ {
+   if (!m_Initialized || !m_omx_output || !frames)
++  {
++    Sleep(10);
+     return frames;
+-
++  }
+   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
+   OMX_BUFFERHEADERTYPE *omx_buffer = NULL;
+ 
+diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
+index 993d4b33a294e88c2c004b7943895ba55558c2d0..21764045fbde39bffe58b61f32ad422231d617d2 100644
+--- a/xbmc/cores/omxplayer/OMXAudio.cpp
++++ b/xbmc/cores/omxplayer/OMXAudio.cpp
+@@ -95,16 +95,23 @@ COMXAudio::COMXAudio() :
+   m_failed_eos      (false  ),
+   m_output          (AESINKPI_UNKNOWN)
+ {
+-  CAEFactory::Suspend();
+-  while (!CAEFactory::IsSuspended())
+-    Sleep(10);
++  // magic value used when omxplayer is playing - want sink to be disabled
++  AEAudioFormat m_format;
++  m_format.m_dataFormat = AE_FMT_RAW;
++  m_format.m_streamInfo.m_type = CAEStreamInfo::STREAM_TYPE_AC3;
++  m_format.m_streamInfo.m_sampleRate = 16000;
++  m_format.m_streamInfo.m_channels = 2;
++  m_format.m_sampleRate = 16000;
++  m_format.m_frameSize = 1;
++  m_pAudioStream = CAEFactory::MakeStream(m_format, 0, nullptr);
+ }
+ 
+ COMXAudio::~COMXAudio()
+ {
+   Deinitialize();
+ 
+-  CAEFactory::Resume();
++  if (m_pAudioStream)
++    CAEFactory::FreeStream(m_pAudioStream);
+ }
+ 
+ bool COMXAudio::PortSettingsChanged()
+diff --git a/xbmc/cores/omxplayer/OMXAudio.h b/xbmc/cores/omxplayer/OMXAudio.h
+index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9d8fe8eab 100644
+--- a/xbmc/cores/omxplayer/OMXAudio.h
++++ b/xbmc/cores/omxplayer/OMXAudio.h
+@@ -24,6 +24,7 @@
+ 
+ #include "cores/AudioEngine/Utils/AEAudioFormat.h"
+ #include "cores/AudioEngine/Utils/AEUtil.h"
++#include "cores/AudioEngine/Interfaces/AEStream.h"
+ #include "linux/PlatformDefs.h"
+ #include "DVDStreamInfo.h"
+ 
+@@ -145,6 +146,7 @@ private:
+   OMX_AUDIO_PARAM_PCMMODETYPE m_pcm_input;
+   OMX_AUDIO_PARAM_DTSTYPE     m_dtsParam;
+   WAVEFORMATEXTENSIBLE        m_wave_header;
++  IAEStream *m_pAudioStream;
+ protected:
+   COMXCoreComponent m_omx_render_analog;
+   COMXCoreComponent m_omx_render_hdmi;
+
+From a6419a20e48ee45774ef61d41ae9124228e632ca Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Wed, 1 Mar 2017 21:40:22 +0000
+Subject: [PATCH 66/67] MMALRender: default to bob (x2) deinterlace for HD
+
+There are still issues with some dvb dongles run on the same Pi as playback.
+Default to bob. Users who aren't using these devices will have to manually enable advanced.
+---
+ xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp | 4 ++--
+ xbmc/cores/omxplayer/OMXVideo.cpp                                  | 5 +++++
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
+index 05cbd8eeaef1a21fc32ea1fa23ea686e3cd7e33b..9279966fa634f6f5a3e00f12dd528337392cf038 100644
+--- a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
++++ b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/MMALRenderer.cpp
+@@ -555,8 +555,8 @@ void CMMALRenderer::Run()
+         if (interlace_method == VS_INTERLACEMETHOD_AUTO)
+         {
+           interlace_method = VS_INTERLACEMETHOD_MMAL_ADVANCED;
+-          // avoid advanced deinterlace when using software decode and HD resolution
+-          if (omvb->m_state == MMALStateFFDec && omvb->m_width * omvb->m_height > 720*576)
++          // avoid advanced deinterlace when using HD resolution
++          if (omvb->m_width * omvb->m_height > 720*576)
+             interlace_method = VS_INTERLACEMETHOD_MMAL_BOB;
+         }
+         bool interlace = (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_VIDEO_FLAG_INTERLACED) ? true:false;
+diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
+index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae9ff2f139 100644
+--- a/xbmc/cores/omxplayer/OMXVideo.cpp
++++ b/xbmc/cores/omxplayer/OMXVideo.cpp
+@@ -236,7 +236,12 @@ bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
+ 
+   EINTERLACEMETHOD interlace_method = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
+   if (interlace_method == VS_INTERLACEMETHOD_AUTO)
++  {
+     interlace_method = VS_INTERLACEMETHOD_MMAL_ADVANCED;
++    // avoid advanced deinterlace when using HD resolution
++    if (port_image.format.video.nFrameWidth * port_image.format.video.nFrameHeight > 720*576)
++      interlace_method = VS_INTERLACEMETHOD_MMAL_BOB;
++  }
+ 
+   if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE)
+   {
+
+From 58b2734d76c3e32caed3ed96da93b3b02391c8d8 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 17 Feb 2017 17:58:13 +0000
+Subject: [PATCH 67/67] ffmpeg: Update hevc optimisation to use the gpu service
+
+---
+ project/cmake/modules/FindFFMPEG.cmake             |    14 +-
+ tools/depends/target/ffmpeg/Makefile               |    14 +-
+ tools/depends/target/ffmpeg/autobuild.sh           |    14 +-
+ .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 52224 ++++++-------------
+ 4 files changed, 16342 insertions(+), 35924 deletions(-)
+
+diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake
+index 486842e9bd6c6e7c65ca0dd9c2a6fc6f26169a5d..92e79eb225640bb8a965ee63b3fd5c743e09758f 100644
+--- a/project/cmake/modules/FindFFMPEG.cmake
++++ b/project/cmake/modules/FindFFMPEG.cmake
+@@ -261,14 +261,14 @@ if(NOT FFMPEG_FOUND)
+                                     ${CMAKE_COMMAND} -E copy
+                                     ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake
+                                     <SOURCE_DIR> &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch &&
+                                     patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
+-                                    patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch &&
++                                    #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch &&
+                                     patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+                      )
+ 
+diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
+index d4f279fd4f2ceb260698cd6fedb124bae61018d0..11e92a9ad618b748cad4831fa6af7565e29081ab 100644
+--- a/tools/depends/target/ffmpeg/Makefile
++++ b/tools/depends/target/ffmpeg/Makefile
+@@ -88,14 +88,14 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
+ 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)
+ 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
+ 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
+-	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+-	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
+-	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
++	#cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
++	#cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
+ 	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
+-	cd $(PLATFORM); patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
+-	cd $(PLATFORM); patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
++	#cd $(PLATFORM); patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
++	#cd $(PLATFORM); patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
+ 	cd $(PLATFORM); patch -p1 < ../73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+ 
+ 	cd $(PLATFORM);\
+diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
+index 9f6c26c8acd08ed603aadeb4d9d81b07026e7506..3d970429012c1f3aede4df0545ced5006c165d50 100755
+--- a/tools/depends/target/ffmpeg/autobuild.sh
++++ b/tools/depends/target/ffmpeg/autobuild.sh
+@@ -132,14 +132,14 @@ mkdir -p "ffmpeg-${VERSION}"
+ cd "ffmpeg-${VERSION}" || exit 2
+ tar --strip-components=1 -xf $MYDIR/${ARCHIVE}
+ 
+-patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+-patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
+-patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
++#patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
++#patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
++#patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
+ patch -p1 < ../pfcd_hevc_optimisations.patch
+-patch -p1 < ../0001-Squashed-commit-of-the-following.patch
+-patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
+-patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
+-patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
++#patch -p1 < ../0001-Squashed-commit-of-the-following.patch
++#patch -p1 < ../0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch
++#patch -p1 < ../0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch
++#patch -p1 < ../h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch
+ 
+ CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
+ ./configure --prefix=$FFMPEG_PREFIX \
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e49b18a67 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -1,14 +1,17 @@
+-From b9b5434c61afd492a54dad5158b4d56ecbf7f01d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 28 Apr 2015 16:18:40 +0100
+-Subject: [PATCH 01/68] Added display output
+-
+----
+- ffmpeg.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 159 insertions(+)
+-
++diff --git a/.gitignore b/.gitignore
++index 524fb73..305632b 100644
++--- a/.gitignore
+++++ b/.gitignore
++@@ -23,6 +23,7 @@
++ .\#*
++ /.config
++ /.version
+++/build/
++ /ffmpeg
++ /ffplay
++ /ffprobe
+ diff --git a/ffmpeg.c b/ffmpeg.c
+-index 9ffd833..50c6e86 100644
++index 9ffd833..7a86d7e 100644
+ --- a/ffmpeg.c
+ +++ b/ffmpeg.c
+ @@ -23,6 +23,11 @@
+@@ -17,17 +20,20 @@ index 9ffd833..50c6e86 100644
+  
+ +#ifdef RPI
+ +#define RPI_DISPLAY
+-+//#define RPI_ZERO_COPY
+++#define RPI_ZERO_COPY
+ +#endif
+ +
+  #include "config.h"
+  #include <ctype.h>
+  #include <string.h>
+-@@ -66,6 +71,20 @@
++@@ -66,6 +71,25 @@
+  # include "libavfilter/buffersrc.h"
+  # include "libavfilter/buffersink.h"
+  
+ +#ifdef RPI_DISPLAY
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ +#include <bcm_host.h>
+ +#include <interface/mmal/mmal.h>
+ +#include <interface/mmal/mmal_parameters_camera.h>
+@@ -36,15 +42,17 @@ index 9ffd833..50c6e86 100644
+ +#include <interface/mmal/util/mmal_default_components.h>
+ +#include <interface/mmal/util/mmal_connection.h>
+ +#include <interface/mmal/util/mmal_util_params.h>
+++#pragma GCC diagnostic pop
+ +#ifdef RPI_ZERO_COPY
+ +#include "libavcodec/rpi_qpu.h"
+ +#endif
+++#include "libavcodec/rpi_zc.h"
+ +#endif
+ +
+  #if HAVE_SYS_RESOURCE_H
+  #include <sys/time.h>
+  #include <sys/types.h>
+-@@ -158,6 +177,134 @@ static int restore_tty;
++@@ -158,6 +182,169 @@ static int restore_tty;
+  static void free_input_threads(void);
+  #endif
+  
+@@ -54,13 +62,7 @@ index 9ffd833..50c6e86 100644
+ +
+ +static MMAL_COMPONENT_T* rpi_display = NULL;
+ +static MMAL_POOL_T *rpi_pool = NULL;
+-+
+-+#ifdef RPI_ZERO_COPY
+-+static uint8_t *get_vc_handle(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+  return (uint8_t *)p->vc_handle;
+-+}
+-+#endif
+++static volatile int rpi_display_count = 0;
+ +
+ +static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+ +{
+@@ -77,7 +79,7 @@ index 9ffd833..50c6e86 100644
+ +    for (i = 0; i < NUM_BUFFERS; ++i)
+ +    {
+ +       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+-+       void* bufPtr = buffer->data;
+++       char * bufPtr = buffer->data;
+ +       memset(bufPtr, i*30, w*h);
+ +       memset(bufPtr+w*h, 128, (w*h)/2);
+ +    }
+@@ -86,81 +88,122 @@ index 9ffd833..50c6e86 100644
+ +    return pool;
+ +}
+ +
+-+static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+++#ifdef RPI_ZERO_COPY
+++    av_rpi_zc_unref(buffer->user_data);
+++    --rpi_display_count;
+++#endif
+++    mmal_buffer_header_release(buffer);
+++}
+++
+++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+ +  mmal_buffer_header_release(buffer);
+ +}
+ +
+ +static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+ +{
+ +    MMAL_COMPONENT_T* display;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+ +    MMAL_DISPLAYREGION_T region =
+ +    {
+-+        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+++        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+ +        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+ +        .layer = 2,
+ +        .fullscreen = 0,
+ +        .dest_rect = {x, y, w, h}
+ +    };
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+++
+ +    bcm_host_init();  // TODO is this needed?
+ +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+ +    assert(display);
+ +
+ +    mmal_port_parameter_set(display->input[0], &region.hdr);
+ +
+-+    MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-+    format->encoding = MMAL_ENCODING_I420;
+-+    format->es->video.width = w2;
+-+    format->es->video.height = h2;
+-+    format->es->video.crop.x = 0;
+-+    format->es->video.crop.y = 0;
+-+    format->es->video.crop.width = w;
+-+    format->es->video.crop.height = h;
+-+    mmal_port_format_commit(display->input[0]);
+++    {
+++        MMAL_ES_FORMAT_T* format = display->input[0]->format;
+++        format->encoding = MMAL_ENCODING_I420;
+++        format->es->video.width = geo.stride_y;
+++        format->es->video.height = geo.height_y;
+++        format->es->video.crop.x = 0;
+++        format->es->video.crop.y = 0;
+++        format->es->video.crop.width = w;
+++        format->es->video.crop.height = h;
+++        mmal_port_format_commit(display->input[0]);
+++    }
+ +
+ +    mmal_component_enable(display);
+ +
+-+    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
+++    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+ +
+ +    mmal_port_enable(display->input[0],display_cb_input);
+-+    mmal_port_enable(display->control,display_cb_input);
+++    mmal_port_enable(display->control,display_cb_control);
+ +
+-+    printf("Allocated display %d %d\n",w,h);
+++    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+ +
+ +    return display;
+ +}
+ +
+-+static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
+++static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+ +{
+-+    int w = fr->width;
+-+    int h = fr->height;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+++    MMAL_BUFFER_HEADER_T* buf;
+++
+ +    if (!display || !rpi_pool)
+ +        return;
+-+    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
+++
+++    if (rpi_display_count >= 3) {
+++        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+++        return;
+++    }
+++
+++    buf = mmal_queue_get(rpi_pool->queue);
+ +    if (!buf) {
+-+      // Running too fast so drop the frame
+-+      return;
+++        // Running too fast so drop the frame
+++        printf("Q alloc failure\n");
+++        return;
+ +    }
+ +    assert(buf);
+ +    buf->cmd = 0;
+-+    buf->length = (w2 * h2 * 3)/2;
+ +    buf->offset = 0; // Offset to valid data
+ +    buf->flags = 0;
+ +#ifdef RPI_ZERO_COPY
+-+    buf->data = get_vc_handle(fr->buf[0]);
+-+    buf->alloc_size = (w2*h2*3)/2;
+++{
+++    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++
+++    buf->user_data = fr_buf;
+++    buf->data = av_rpi_zc_vc_handle(fr_buf);
+++    buf->alloc_size =
+++        buf->length = av_rpi_zc_numbytes(fr_buf);
+++
+++    ++rpi_display_count;
+++}
+ +#else
+++{
+++#error YYY
+++    int w = fr->width;
+++    int h = fr->height;
+++    int w2 = (w+31)&~31;
+++    int h2 = (h+15)&~15;
+++
+++    buf->length = (w2 * h2 * 3)/2;
+++    buf->user_data = NULL;
+++
+ +    //mmal_buffer_header_mem_lock(buf);
+ +    memcpy(buf->data, fr->data[0], w2 * h);
+ +    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+ +    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+ +    //mmal_buffer_header_mem_unlock(buf);
+++}
+ +#endif
+ +
+-+    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
+++    while (rpi_display_count >= 3) {
+++        usleep(5000);
+++    }
+++
+++    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+++    {
+++        printf("** send failed: depth=%d\n", rpi_display_count);
+++        display_cb_input(NULL, buf);
+++    }
+ +}
+ +
+ +static void display_exit(MMAL_COMPONENT_T* display)
+@@ -179,4965 +222,6886 @@ index 9ffd833..50c6e86 100644
+  /* sub2video hack:
+     Convert subtitles to video with alpha to insert them in filter graphs.
+     This is a temporary solution until libavfilter gets real subtitles support.
+-@@ -581,6 +728,10 @@ static void ffmpeg_cleanup(int ret)
++@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
++         avformat_close_input(&input_files[i]->ctx);
++         av_freep(&input_files[i]);
+      }
+-     term_exit();
+-     ffmpeg_exited = 1;
+ +
+ +#ifdef RPI_DISPLAY
+ +    display_exit(rpi_display);
+ +#endif
+++
++     for (i = 0; i < nb_input_streams; i++) {
++         InputStream *ist = input_streams[i];
++ 
++@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
++         av_freep(&ist->filters);
++         av_freep(&ist->hwaccel_device);
++ 
+++#ifdef RPI_ZERO_COPY
+++        av_rpi_zc_uninit(ist->dec_ctx);
+++#endif
++         avcodec_free_context(&ist->dec_ctx);
++ 
++         av_freep(&input_streams[i]);
++@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
++     }
++     term_exit();
++     ffmpeg_exited = 1;
+++
+  }
+  
+  void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -940,6 +1091,14 @@ static void do_video_out(AVFormatContext *s,
+-     int frame_size = 0;
+-     InputStream *ist = NULL;
+-     AVFilterContext *filter = ost->filter->filter;
++@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
++     if (ost->source_index >= 0)
++         ist = input_streams[ost->source_index];
++ 
+ +#ifdef RPI_DISPLAY
+-+    if (next_picture)
+++    if (next_picture && ist != NULL)
+ +    {
+-+	if (!rpi_display)
+++        if (!rpi_display)
+ +           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+-+        display_frame(rpi_display,next_picture);
+++        display_frame(ist->dec_ctx, rpi_display, next_picture);
+ +    }
+ +#endif
+++
++     if (filter->inputs[0]->frame_rate.num > 0 &&
++         filter->inputs[0]->frame_rate.den > 0)
++         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
++@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
++         ist->dec_ctx->opaque                = ist;
++         ist->dec_ctx->get_format            = get_format;
++         ist->dec_ctx->get_buffer2           = get_buffer;
+++
+++#ifdef RPI_ZERO_COPY
+++        // Overrides the above get_buffer2
+++        av_rpi_zc_init(ist->dec_ctx);
+++#endif
+++
++         ist->dec_ctx->thread_safe_callbacks = 1;
+  
+-     if (ost->source_index >= 0)
+-         ist = input_streams[ost->source_index];
+--- 
+-2.7.4
+-
+-
+-From b90a5aff7bf9112ebd2a07949c8d79a49fcafe48 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 29 Apr 2015 16:49:43 +0100
+-Subject: [PATCH 02/68] Split transform and intra prediction into commands
+-
+----
+- libavcodec/hevc.c       | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
+- libavcodec/hevc.h       |  58 +++++++++++++++++++++++
+- libavcodec/hevc_cabac.c |  15 ++++++
+- 3 files changed, 191 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b478065..aa45dd6 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -931,6 +931,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+-     return 0;
+- }
++         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
++diff --git a/libavcodec/Makefile b/libavcodec/Makefile
++index fd0d1f0..40d22d2 100644
++--- a/libavcodec/Makefile
+++++ b/libavcodec/Makefile
++@@ -5,6 +5,11 @@ NAME = avcodec
++ HEADERS = avcodec.h                                                     \
++           avdct.h                                                       \
++           avfft.h                                                       \
+++          rpi_qpu.h                                                     \
+++          rpi_shader.h                                                  \
+++          rpi_mailbox.h                                                 \
+++          rpi_hevc_transform.h                                          \
+++          rpi_zc.h                                                      \
++           d3d11va.h                                                     \
++           dirac.h                                                       \
++           dv_profile.h                                                  \
++@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
++        resample.o                                                       \
++        resample2.o                                                      \
++        utils.o                                                          \
+++       rpi_qpu.o                                                        \
+++       rpi_shader.o                                                     \
+++       rpi_mailbox.o                                                    \
+++       rpi_zc.o                                                         \
++        vorbis_parser.o                                                  \
++        xiph.o                                                           \
+  
+-+#ifdef RPI
+-+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+-+{
+-+    if (s->enable_rpi) {
+-+        HEVCLocalContext *lc = s->HEVClc;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        cmd->type = RPI_PRED_INTRA;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->c_idx = c_idx;
+-+        cmd->x = x0;
+-+        cmd->y = y0;
+-+        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+-+    } else {
+-+        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+-+    }
+-+}
++@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
++ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
++ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
++ endif
+++
+++$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+++	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+++	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
++diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
++index 54efaad..02a89c3 100644
++--- a/libavcodec/allcodecs.c
+++++ b/libavcodec/allcodecs.c
++@@ -667,6 +667,7 @@ void avcodec_register_all(void)
++     REGISTER_PARSER(H261,               h261);
++     REGISTER_PARSER(H263,               h263);
++     REGISTER_PARSER(H264,               h264);
+++    REGISTER_PARSER(H264_MVC,           h264_mvc);
++     REGISTER_PARSER(HEVC,               hevc);
++     REGISTER_PARSER(MJPEG,              mjpeg);
++     REGISTER_PARSER(MLP,                mlp);
++diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
++index a4ceca7..1354c14 100644
++--- a/libavcodec/arm/Makefile
+++++ b/libavcodec/arm/Makefile
++@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
++ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
++ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
++                                           arm/hevcdsp_deblock_neon.o    \
+++                                          arm/hevcdsp_epel_neon.o       \
++                                           arm/hevcdsp_idct_neon.o       \
++-                                          arm/hevcdsp_qpel_neon.o
+++                                          arm/hevcdsp_qpel_neon.o       \
+++                                          arm/hevcdsp_sao_neon.o
++ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
++ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
++                                           arm/rv40dsp_neon.o
++diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
++index fdbf86b..0a3980a 100644
++--- a/libavcodec/arm/cabac.h
+++++ b/libavcodec/arm/cabac.h
++@@ -26,13 +26,34 @@
++ #include "libavutil/internal.h"
++ #include "libavcodec/cabac.h"
++ 
+++
+++#if UNCHECKED_BITSTREAM_READER
+++#define LOAD_16BITS_BEHI\
+++        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+++#elif CONFIG_THUMB
+++#define LOAD_16BITS_BEHI\
+++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+++        "it         cs                                          \n\t"\
+++        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+++#else
+++#define LOAD_16BITS_BEHI\
+++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+++        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+++        "rev        %[tmp]        , %[tmp]                      \n\t"
+ +#endif
+ +
+- static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                               int xBase, int yBase, int cb_xBase, int cb_yBase,
+-                               int log2_cb_size, int log2_trafo_size,
+-@@ -943,8 +962,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-     if (lc->cu.pred_mode == MODE_INTRA) {
+-         int trafo_size = 1 << log2_trafo_size;
+-         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
+++
++ #define get_cabac_inline get_cabac_inline_arm
++ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
++                                                  uint8_t *const state)
++ {
++     int bit;
+++#if 0
++     void *reg_b, *reg_c, *tmp;
+ -
+-+#ifdef RPI
+-+        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
++     __asm__ volatile(
++         "ldrb       %[bit]        , [%[state]]                  \n\t"
++         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
++@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
++           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
++         : "memory", "cc"
++         );
+ +#else
+-         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+++   // *** Not thumb compatible yet
+++   unsigned int reg_b, tmp;
+++    __asm__ (
+++        "ldrb       %[bit]        , [%[state]]                  \n\t"
+++        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+++        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
+++// %bit = *state
+++// %range = range
+++// %tmp = RangeLPS
+++        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "ittt       ge                                          \n\t"
+++        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "mvnge      %[bit]        , %[bit]                      \n\t"
+++        "movge      %[range]      , %[tmp]                      \n\t"
+++
+++        "clz        %[tmp]        , %[range]                    \n\t"
+++        "sub        %[tmp]        , #23                         \n\t"
+++
+++        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++        "strb       %[r_b]        , [%[state]]                  \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++
+++        "bne        2f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+++        "movw       %[r_b]        , #0xFFFF                     \n\t"
+++        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++
+++        "rbit       %[r_b]        , %[low]                      \n\t"
+++        "clz        %[r_b]        , %[r_b]                      \n\t"
+++        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+++#if CONFIG_THUMB
+++        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+++#else
+++        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+++#endif
+++        "2:                                                     \n\t"
+++        :    [bit]"=&r"(bit),
+++             [low]"+&r"(c->low),
+++           [range]"+&r"(c->range),
+++             [r_b]"=&r"(reg_b),
+++             [ptr]"+&r"(c->bytestream),
+++             [tmp]"=&r"(tmp)
+++          :  [state]"r"(state),
+++            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++              [byte]"M"(offsetof(CABACContext, bytestream)),
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++        : "memory", "cc"
+++        );
+ +#endif
+-     }
+  
+-     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+-@@ -1030,7 +1052,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++     return bit & 1;
++ }
+++
+++#define get_cabac_bypass get_cabac_bypass_arm
+++static inline int get_cabac_bypass_arm(CABACContext * const c)
+++{
+++    int rv = 0;
+++    unsigned int tmp;
+++    __asm (
+++        "lsl        %[low]        , #1                          \n\t"
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "adc        %[rv]         , %[rv]       , #0            \n\t"
+++        "it         cs                                          \n\t"
+++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++        "bne        1f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+++        "movw       %[tmp]        , #0xFFFF                     \n\t"
+++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "1:                                                     \n\t"
+++        : // Outputs
+++              [rv]"+&r"(rv),
+++             [low]"+&r"(c->low),
+++             [tmp]"=&r"(tmp),
+++             [ptr]"+&r"(c->bytestream)
+++        : // Inputs
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++             [range]"r"(c->range)
+++        : "cc"
+++    );
+++    return rv;
+++}
+++
+++
+++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+++{
+++    unsigned int tmp;
+++    __asm (
+++        "lsl        %[low]        , #1                          \n\t"
+++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+++        "ite        cc                                          \n\t"
+++        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
+++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++        "bne        1f                                          \n\t"
+++        LOAD_16BITS_BEHI
+++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+++        "movw       %[tmp]        , #0xFFFF                     \n\t"
+++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+++        "1:                                                     \n\t"
+++        : // Outputs
+++              [rv]"+&r"(rv),
+++             [low]"+&r"(c->low),
+++             [tmp]"=&r"(tmp),
+++             [ptr]"+&r"(c->bytestream)
+++        : // Inputs
+++#if !UNCHECKED_BITSTREAM_READER
+++                 [c]"r"(c),
+++               [end]"M"(offsetof(CABACContext, bytestream_end)),
+++#endif
+++             [range]"r"(c->range)
+++        : "cc"
+++    );
+++    return rv;
+++}
+++
++ #endif /* HAVE_ARMV6T2_INLINE */
++ 
++ #endif /* AVCODEC_ARM_CABAC_H */
++diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
++new file mode 100644
++index 0000000..31d3c59
++--- /dev/null
+++++ b/libavcodec/arm/hevc_cabac.h
++@@ -0,0 +1,491 @@
+++/*
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+++
+++#ifndef AVCODEC_ARM_HEVC_CABAC_H
+++#define AVCODEC_ARM_HEVC_CABAC_H
+++
+++#include "config.h"
+++#if HAVE_ARMV6T2_INLINE
+++
+++#define hevc_mem_bits32 hevc_mem_bits32_arm
+++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+++{
+++    unsigned int n;
+++    __asm__ (
+++        "rev        %[n], %[x]                     \n\t"
+++        : [n]"=r"(n)
+++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+++        :
+++        );
+++    return n << (bits & 7);
+++}
+++
+++
+++// ---------------------------------------------------------------------------
+++//
+++// Helper fns - little bits of code where ARM has an instruction that the
+++// compiler doesn't know about / use
+++
+++#define trans_scale_sat trans_scale_sat_arm
+++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+++{
+++    int rv;
+++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+++
+++    __asm__ (
+++    "ssat %[rv], #16, %[t], ASR #1 \n\t"
+++    : [rv]"=r"(rv)
+++    : [t]"r"(t)
+++    :
+++    );
+++    return rv;
+++}
+++
+++#define update_rice update_rice_arm
+++static inline void update_rice_arm(uint8_t * const stat_coeff,
+++    const unsigned int last_coeff_abs_level_remaining,
+++    const unsigned int c_rice_param)
+++{
+++    int t;
+++    __asm__ (
+++    "lsl   %[t], %[coeff], #1               \n\t"
+++    "lsrs  %[t], %[t], %[shift]             \n\t"
+++    "it    eq                               \n\t"
+++    "subeq %[stat], %[stat], #1             \n\t"
+++    "cmp   %[t], #6                         \n\t"
+++    "adc   %[stat], %[stat], #0             \n\t"
+++    "usat  %[stat], #8, %[stat]             \n\t"
+++    : [stat]"+&r"(*stat_coeff),
+++         [t]"=&r"(t)
+++    :  [coeff]"r"(last_coeff_abs_level_remaining),
+++       [shift]"r"(c_rice_param)
+++    : "cc"
+++    );
+++}
+++
+++// ---------------------------------------------------------------------------
+++//
+++// CABAC get loops
+++//
+++// Where the loop is simple enough we can normally do 10-30% better than the
+++// compiler
+++
+++// Get the residual greater than 1 bits
+++
+++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+++    uint8_t * const state0)
+++{
+++    unsigned int i, reg_b, st, tmp, bit, rv;
+++     __asm__ (
+++         "mov        %[i]          , #0                          \n\t"
+++         "mov        %[rv]         , #0                          \n\t"
+++         "1:                                                     \n\t"
+++         "add        %[i]          , %[i]        , #1            \n\t"
+++         "cmp        %[rv]         , #0                          \n\t"
+++         "ite        eq                                          \n\t"
+++         "usateq     %[st]         , #2          , %[i]          \n\t"
+++         "movne      %[st]         , #0                          \n\t"
+++
+++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
+++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "cmp        %[low]        , %[range], lsl #17           \n\t"
+++         "ittt       ge                                          \n\t"
+++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++         "mvnge      %[bit]        , %[bit]                      \n\t"
+++         "movge      %[range]      , %[tmp]                      \n\t"
+++
+++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++         "and        %[bit]        , %[bit]      , #1            \n\t"
+++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
+++
+++         "clz        %[tmp]        , %[range]                    \n\t"
+++         "sub        %[tmp]        , #23                         \n\t"
+++
+++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+++// There is a small speed gain from combining both conditions, using a single
+++// branch and then working out what that meant later
+++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++         "it         ne                                          \n\t"
+++         "cmpne      %[n]          , %[i]                        \n\t"
+++         "bne        1b                                          \n\t"
+++
+++// If reload is not required then we must have run out of flags to decode
+++         "tst        %[tmp]        , %[tmp]                      \n\t"
+++         "bne        2f                                          \n\t"
+++
+++// Do reload
+++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+++         "movw       %[r_b]        , #0xFFFF                     \n\t"
+++         "rev        %[tmp]        , %[tmp]                      \n\t"
+++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+++
+++         "rbit       %[r_b]        , %[low]                      \n\t"
+++         "clz        %[r_b]        , %[r_b]                      \n\t"
+++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+++
+++#if CONFIG_THUMB
+++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cb[i])
+-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1059,7 +1085,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+++
+++         "cmp        %[n]          , %[i]                        \n\t"
+++         "bne        1b                                          \n\t"
+++         "2:                                                     \n\t"
+++         :    [bit]"=&r"(bit),
+++              [low]"+&r"(c->low),
+++            [range]"+&r"(c->range),
+++              [r_b]"=&r"(reg_b),
+++             [bptr]"+&r"(c->bytestream),
+++                [i]"=&r"(i),
+++              [tmp]"=&r"(tmp),
+++               [st]"=&r"(st),
+++               [rv]"=&r"(rv)
+++          :  [state0]"r"(state0),
+++                  [n]"r"(n),
+++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++               [byte]"M"(offsetof(CABACContext, bytestream)),
+++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++         : "memory", "cc"
+++    );
+++    return rv;
+++}
+++
+++
+++// n must be > 0 on entry
+++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * p)
+++{
+++    unsigned int reg_b, tmp, st, bit;
+++     __asm__ (
+++         "1:                                                     \n\t"
+++// Get bin from map
+++         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
+++
+++// Load state & ranges
+++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
+++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
+++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "cmp        %[low]        , %[range], lsl #17           \n\t"
+++         "ittt       ge                                          \n\t"
+++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+++         "mvnge      %[bit]        , %[bit]                      \n\t"
+++         "movge      %[range]      , %[tmp]                      \n\t"
+++
+++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+++         "tst        %[bit]        , #1                          \n\t"
+++// GCC asm seems to need strbne written differently for thumb and arm
+++#if CONFIG_THUMB
+++         "it         ne                                          \n\t"
+++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cr[i])
+-                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1088,7 +1118,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+-                                                     trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+++
+++// Renorm
+++         "clz        %[tmp]        , %[range]                    \n\t"
+++         "sub        %[tmp]        , #23                         \n\t"
+++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+++
+++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+++// There is a small speed gain from combining both conditions, using a single
+++// branch and then working out what that meant later
+++         "subs       %[n]          , %[n]        , #1            \n\t"
+++#if CONFIG_THUMB
+++         "itt        ne                                          \n\t"
+++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        1b                                          \n\t"
+ +#else
+-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        1b                                          \n\t"
+ +#endif
+-                 }
+-                 if (cbf_cb[i])
+-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1098,7 +1132,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                 if (lc->cu.pred_mode == MODE_INTRA) {
+-                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+-+#else
+-                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+-+#endif
+-                 }
+-                 if (cbf_cr[i])
+-                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1110,26 +1148,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+-             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+-             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+-+            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+-+#else
+-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
+-             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+-+#endif
+-             if (s->ps.sps->chroma_format_idc == 2) {
+-                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+-+                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+-+#else
+-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
+-                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+-+#endif
+-             }
+-         } else if (blk_idx == 3) {
+-             int trafo_size_h = 1 << (log2_trafo_size + 1);
+-             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
+-             ff_hevc_set_neighbour_available(s, xBase, yBase,
+-                                             trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+-+            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+-+#else
+-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
+-             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+-+#endif
+-             if (s->ps.sps->chroma_format_idc == 2) {
+-                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
+-                                                 trafo_size_h, trafo_size_v);
+-+#ifdef RPI
+-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+-+                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+-+#else
+-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
+-                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+-+#endif
+-             }
+-         }
+-     }
+-@@ -2304,6 +2362,31 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+-     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+- }
+- 
+-+#ifdef RPI
+-+static void rpi_execute_pred_cmds(HEVCContext *s)
+-+{
+-+  int i;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds;
+-+  HEVCLocalContext *lc = s->HEVClc;
+ +
+-+  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
+-+      if (cmd->type == RPI_PRED_INTRA) {
+-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
+-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
+-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-+      } else {
+-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+      }
+-+  }
+-+  s->num_pred_cmds = 0;
+-+  s->num_coeffs = 0;
+-+}
+-+#endif
+++// If we have bits left then n must be 0 so give up now
+++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+++         "bne        2f                                          \n\t"
+ +
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -2313,6 +2396,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int y_ctb       = 0;
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+-+#ifdef RPI
+-+    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+-+#endif
+++// Do reload
+++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+++         "movw       %[r_b]        , #0xFFFF                     \n\t"
+++         "rev        %[tmp]        , %[tmp]                      \n\t"
+++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+ +
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-         return AVERROR_INVALIDDATA;
+-@@ -2342,6 +2429,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+-+#ifdef RPI
+-+        rpi_execute_pred_cmds(s);
+-+#endif
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-             return more_data;
+-@@ -2387,6 +2477,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+-     s = s1->sList[self_id];
+-     lc = s->HEVClc;
+- 
+-+#ifdef RPI
+-+    s->enable_rpi = 0;
+-+#endif
+++         "rbit       %[r_b]        , %[low]                      \n\t"
+++         "clz        %[r_b]        , %[r_b]                      \n\t"
+++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+ +
+-     if(ctb_row) {
+-         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+- 
+-@@ -3075,6 +3169,13 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- 
+-     av_freep(&s->cabac_state);
+- 
+-+#ifdef RPI
+-+    av_freep(&s->unif_mv_cmds);
+-+    av_freep(&s->unif_xfm_cmds);
+-+    av_freep(&s->univ_pred_cmds);
+-+    av_freep(&s->coeffs_buf);
+++#if CONFIG_THUMB
+++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+++#else
+++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+ +#endif
+ +
+-     for (i = 0; i < 3; i++) {
+-         av_freep(&s->sao_pixel_buffer_h[i]);
+-         av_freep(&s->sao_pixel_buffer_v[i]);
+-@@ -3129,6 +3230,22 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->HEVClcList[0] = s->HEVClc;
+-     s->sList[0] = s;
+- 
+-+#ifdef RPI
+-+    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+    if (!s->unif_mv_cmds)
+-+        goto fail;
+-+    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
+-+    if (!s->unif_xfm_cmds)
+-+        goto fail;
+-+    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-+    if (!s->univ_pred_cmds)
+-+        goto fail;
+-+    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
+-+    if (!s->coeffs_buf)
+-+        goto fail;
+-+    s->enable_rpi = 0;
+-+#endif
+++// Check to see if we still have more to do
+++         "cmp        %[n]          , #0                          \n\t"
+++         "bne        1b                                          \n\t"
+++         "2:                                                     \n\t"
+++         :    [bit]"=&r"(bit),
+++              [low]"+&r"(c->low),
+++            [range]"+&r"(c->range),
+++              [r_b]"=&r"(reg_b),
+++             [bptr]"+&r"(c->bytestream),
+++              [idx]"+&r"(p),
+++                [n]"+&r"(n),
+++              [tmp]"=&r"(tmp),
+++               [st]"=&r"(st)
+++          :  [state0]"r"(state0),
+++            [ctx_map]"r"(ctx_map),
+++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+++               [byte]"M"(offsetof(CABACContext, bytestream)),
+++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+++         : "memory", "cc"
+++    );
+++
+++    return p;
+++}
+ +
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-     if (!s->cabac_state)
+-         goto fail;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index be91010..7a1c35f 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -23,6 +23,9 @@
+- #ifndef AVCODEC_HEVC_H
+- #define AVCODEC_HEVC_H
+- 
+-+// define RPI to split the CABAC/prediction/transform into separate stages
+-+#include "config.h"
+++// ---------------------------------------------------------------------------
+++//
+++// CABAC_BY22 functions
+++//
+++// By and large these are (at best) no faster than their C equivalents - the
+++// only one worth having is _peek where we do a slightly better job than the
+++// compiler
+++//
+++// The others have been stashed here for reference in case larger scale asm
+++// is attempted in which case they might be a useful base
+ +
+- #include "libavutil/buffer.h"
+- #include "libavutil/md5.h"
+- 
+-@@ -790,6 +793,49 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+-+#ifdef RPI
+ +
+-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-+#define RPI_MAX_WIDTH 2048
+++#define get_cabac_by22_peek get_cabac_by22_peek_arm
+++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+++{
+++    uint32_t rv, tmp;
+++    __asm__ (
+++        "bic      %[rv]  , %[low], #1            \n\t"
+++        "cmp      %[inv] , #0                    \n\t"
+++        "it       ne                             \n\t"
+++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+++        :  // Outputs
+++             [rv]"=&r"(rv),
+++             [tmp]"=r"(tmp)
+++        :  // Inputs
+++             [low]"r"(c->low),
+++             [inv]"r"(c->range)
+++        :  // Clobbers
+++                "cc"
+++    );
+++    return rv << 1;
+++}
+ +
+-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
+-+#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
+-+#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+-+// Each block can have an intra prediction and a transform_add command
+-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+++#if 0
+ +
+-+// Command for inter prediction
+-+typedef struct HEVCMvCmd {
+-+} HEVCMvCmd;
+++// ***** Slower than the C  :-(
+++#define get_cabac_by22_flush get_cabac_by22_flush_arm
+++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+++{
+++    uint32_t m, tmp;
+++    __asm__ (
+++    "add    %[bits], %[bits], %[n]   \n\t"
+++    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
+++
+++    "rsb    %[tmp], %[n], #32        \n\t"
+++    "lsr    %[tmp], %[val], %[tmp]   \n\t"
+++    "mul    %[tmp], %[range], %[tmp] \n\t"
+++
+++    "rev    %[m], %[m]               \n\t"
+++
+++    "lsl    %[tmp], %[tmp], #23      \n\t"
+++    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+++
+++    "and    %[tmp], %[bits], #7         \n\t"
+++    "lsl    %[m], %[m], %[tmp]          \n\t"
+++
+++    "orr    %[low], %[low], %[m], lsr #9      \n\t"
+++        :  // Outputs
+++             [m]"=&r"(m),
+++           [tmp]"=&r"(tmp),
+++          [bits]"+&r"(c->by22.bits),
+++           [low]"+&r"(c->low)
+++        :  // Inputs
+++               [n]"r"(n),
+++             [val]"r"(val),
+++             [inv]"r"(c->range),
+++           [range]"r"(c->by22.range),
+++             [ptr]"r"(c->bytestream)
+++        :  // Clobbers
+++    );
+++}
+ +
+-+// Command for transform to process a block of coefficients
+-+typedef struct HEVCXfmCmd {
+-+} HEVCXfmCmd;
+ +
+-+// Command for intra prediction and transform_add of predictions to coefficients
+-+#define RPI_PRED_TRANSFORM_ADD 0
+-+#define RPI_PRED_INTRA 1
+-+typedef struct HEVCPredCmd {
+-+    uint8_t size;
+-+    uint8_t type;
+-+    uint8_t na;
+-+    uint8_t c_idx;
+-+    union {
+-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t x;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t y;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t stride;         // RPI_PRED_INTRA
+-+    };
+-+} HEVCPredCmd;
+++// Works but slower than C
+++#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+++static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+++{
+++    uint32_t n, val, tmp, level;
+ +
+-+#endif
+++//    PROFILE_START();
+ +
+- typedef struct HEVCContext {
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+-@@ -805,6 +851,18 @@ typedef struct HEVCContext {
+-     int                 width;
+-     int                 height;
+- 
+-+#ifdef RPI
+-+    int enable_rpi;
+-+    HEVCMvCmd *unif_mv_cmds;
+-+    HEVCXfmCmd *unif_xfm_cmds;
+-+    HEVCPredCmd *univ_pred_cmds;
+-+    int16_t *coeffs_buf;
+-+    int num_mv_cmds;
+-+    int num_xfm_cmds;
+-+    int num_pred_cmds;
+-+    int num_coeffs;
+-+#endif
+++    __asm__ (
+++            // Peek
+++            "bic    %[val],  %[low],   #1  \n\t"
+++            "cmp    %[inv], #0          \n\t"
+++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+++            "lsl    %[val], %[val], #1  \n\t"
+ +
+-     uint8_t *cabac_state;
+- 
+-     /** 1 if the independent slice segment header was successfully parsed */
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 05b2821..4e97f06 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1510,6 +1510,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-         }
+-     }
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int16_t *c = s->coeffs_buf + s->num_coeffs;
+-+        int n = trafo_size * trafo_size;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
+-+        s->num_coeffs += n;
+-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->buf = c;
+-+        cmd->dst = dst;
+-+        cmd->stride = stride;
+-+        return;
+-+    }
+-+#endif
+-     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+- }
+- 
+--- 
+-2.7.4
+-
+-
+-From f8293de11dc040d9fa2a558762a357c0c353d2c9 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 30 Apr 2015 15:23:22 +0100
+-Subject: [PATCH 03/68] Added simple VPU test code
+-
+----
+- libavcodec/Makefile             |    7 +
+- libavcodec/hevc.c               |   33 +-
+- libavcodec/rpi_hevc_transform.h |  212 ++++++
+- libavcodec/rpi_hevc_transform.s |  147 ++++
+- libavcodec/rpi_mailbox.c        |  293 ++++++++
+- libavcodec/rpi_mailbox.h        |   20 +
+- libavcodec/rpi_qpu.c            |  652 ++++++++++++++++++
+- libavcodec/rpi_qpu.h            |   45 ++
+- libavcodec/rpi_shader.c         |  818 ++++++++++++++++++++++
+- libavcodec/rpi_shader.h         |   20 +
+- libavcodec/rpi_shader.qasm      | 1413 +++++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_user_vcsm.h      |  425 ++++++++++++
+- 12 files changed, 4084 insertions(+), 1 deletion(-)
+- create mode 100644 libavcodec/rpi_hevc_transform.h
+- create mode 100644 libavcodec/rpi_hevc_transform.s
+- create mode 100644 libavcodec/rpi_mailbox.c
+- create mode 100644 libavcodec/rpi_mailbox.h
+- create mode 100644 libavcodec/rpi_qpu.c
+- create mode 100644 libavcodec/rpi_qpu.h
+- create mode 100644 libavcodec/rpi_shader.c
+- create mode 100644 libavcodec/rpi_shader.h
+- create mode 100644 libavcodec/rpi_shader.qasm
+- create mode 100644 libavcodec/rpi_user_vcsm.h
+-
+-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index fd0d1f0..03065cd 100644
+---- a/libavcodec/Makefile
+-+++ b/libavcodec/Makefile
+-@@ -5,6 +5,10 @@ NAME = avcodec
+- HEADERS = avcodec.h                                                     \
+-           avdct.h                                                       \
+-           avfft.h                                                       \
+-+          rpi_qpu.h                                                     \
+-+          rpi_shader.h                                                  \
+-+          rpi_mailbox.h                                                 \
+-+          rpi_hevc_transform.h                                          \
+-           d3d11va.h                                                     \
+-           dirac.h                                                       \
+-           dv_profile.h                                                  \
+-@@ -43,6 +47,9 @@ OBJS = allcodecs.o                                                      \
+-        resample.o                                                       \
+-        resample2.o                                                      \
+-        utils.o                                                          \
+-+       rpi_qpu.o                                                        \
+-+       rpi_shader.o                                                     \
+-+       rpi_mailbox.o                                                    \
+-        vorbis_parser.o                                                  \
+-        xiph.o                                                           \
+- 
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index aa45dd6..ab55df1 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -41,6 +41,10 @@
+- #include "hevc.h"
+- #include "profiles.h"
+- 
+-+#ifdef RPI
+-+#include "rpi_qpu.h"
+++            // Count bits (n = prefix)
+++            "mvn    %[n], %[val] \n\t"
+++            "clz    %[n], %[n]   \n\t"
+++
+++            "lsl    %[level], %[val], %[n] \n\t"
+++            "subs   %[tmp], %[n], #3 \n\t"
+++            "blo    2f \n\t"
+++
+++            // prefix >= 3
+++            // < tmp = prefix - 3
+++            // > tmp = prefix + rice - 3
+++            "add    %[tmp], %[tmp], %[rice] \n\t"
+++            // > n = prefix * 2 + rice - 3
+++            "add    %[n], %[tmp], %[n] \n\t"
+++            "cmp    %[n], #21 \n\t"
+++            "bhi    3f \n\t"
+++
+++            "orr    %[level], %[level], #0x80000000 \n\t"
+++            "rsb    %[tmp], %[tmp], #31 \n\t"
+++            "lsr    %[level], %[level], %[tmp] \n\t"
+++
+++            "mov    %[tmp], #2 \n\t"
+++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+++            "b      1f \n\t"
+++
+++            // > 22 bits used in total - need reload
+++            "3:  \n\t"
+++
+++            // Stash prefix + rice - 3 in level (only spare reg)
+++            "mov    %[level], %[tmp] \n\t"
+++            // Restore n to flush value (prefix)
+++            "sub    %[n], %[n], %[tmp] \n\t"
+++
+++            // Flush + reload
+++
+++//          "rsb    %[tmp], %[n], #32        \n\t"
+++//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
+++//          "mul    %[tmp], %[range], %[tmp] \n\t"
+++
+++            // As it happens we know that all the bits we are flushing are 1
+++            // so we can cheat slightly
+++            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
+++            "lsl    %[tmp], %[tmp], #23      \n\t"
+++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+++
+++            "add    %[bits], %[bits], %[n]   \n\t"
+++            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
+++            "rev    %[n], %[n]               \n\t"
+++            "and    %[tmp], %[bits], #7         \n\t"
+++            "lsl    %[n], %[n], %[tmp]          \n\t"
+++
+++            "orr    %[low], %[low], %[n], lsr #9      \n\t"
+++
+++            // (reload)
+++
+++            "bic    %[val],  %[low],   #1  \n\t"
+++            "cmp    %[inv], #0          \n\t"
+++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+++            "lsl    %[val], %[val], #1  \n\t"
+++
+++            // Build value
+++
+++            "mov    %[n], %[level] \n\t"
+++
+++            "orr     %[tmp], %[val], #0x80000000 \n\t"
+++            "rsb     %[level], %[level], #31 \n\t"
+++            "lsr     %[level], %[tmp], %[level] \n\t"
+++
+++            "mov    %[tmp], #2 \n\t"
+++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+++            "b      1f \n\t"
+++
+++            // prefix < 3
+++            "2:  \n\t"
+++            "rsb    %[tmp], %[rice], #31 \n\t"
+++            "lsr    %[level], %[level], %[tmp] \n\t"
+++            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
+++            "add    %[n], %[n], %[rice] \n\t"
+++
+++            "1:  \n\t"
+++            // Flush
+++            "add    %[n], %[n], #1 \n\t"
+++
+++            "rsb    %[tmp], %[n], #32        \n\t"
+++            "lsr    %[tmp], %[val], %[tmp]   \n\t"
+++
+++            "add    %[bits], %[bits], %[n]   \n\t"
+++            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
+++
+++            "mul    %[tmp], %[range], %[tmp] \n\t"
+++            "lsl    %[tmp], %[tmp], #23      \n\t"
+++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+++
+++            "rev    %[val], %[val]               \n\t"
+++            "and    %[tmp], %[bits], #7         \n\t"
+++            "lsl    %[val], %[val], %[tmp]          \n\t"
+++
+++            "orr    %[low], %[low], %[val], lsr #9      \n\t"
+++        :  // Outputs
+++         [level]"=&r"(level),
+++             [n]"=&r"(n),
+++           [val]"=&r"(val),
+++           [tmp]"=&r"(tmp),
+++          [bits]"+&r"(c->by22.bits),
+++           [low]"+&r"(c->low)
+++        :  // Inputs
+++            [rice]"r"(c_rice_param),
+++             [inv]"r"(c->range),
+++           [range]"r"(c->by22.range),
+++             [ptr]"r"(c->bytestream)
+++        :  // Clobbers
+++                "cc"
+++    );
+++
+++//    PROFILE_ACC(residual_abs);
+++
+++    return level;
+++}
+ +#endif
+ +
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- /**
+-@@ -2430,7 +2434,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        rpi_execute_pred_cmds(s);
+-+        if (x_ctb + ctb_size >= s->ps.sps->width) {
+-+            rpi_execute_pred_cmds(s);
+-+        }
+- #endif
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-@@ -3244,6 +3250,31 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->coeffs_buf)
+-         goto fail;
+-     s->enable_rpi = 0;
+-+
+-+    // A little test program
+-+    {
+-+      GPU_MEM_PTR_T p;
+-+      int err = gpu_malloc_cached(16, &p);
+-+      short *q = (short *)p.arm;
+-+      int i;
+-+      int r;
+-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
+-+      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
+-+      printf("Preparing data %p\n",q);
+-+      for(i=0;i<16;i++)
+-+        q[i] = i;
+-+      printf("Flush cache\n");
+-+      gpu_cache_flush(&p);
+-+      printf("Executing code\n");
+-+      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
+-+      printf("Return value %d (",r);
+-+      for(i=0;i<16;i++)
+-+        printf("%d ",q[i]);
+-+      printf(")\n");
+-+      gpu_free(&p);
+-+      goto fail; // Early out
+-+    }
+++#endif /* HAVE_ARMV6T2_INLINE */
+ +
+- #endif
+- 
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
++diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
++index 166bddb..a088cc3 100644
++--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
++@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
++         vst1.8   {d4}, [r0]
++         bx       lr
++ endfunc
+++
+++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+++ */
+++function ff_hevc_deblocking_boundary_strengths_neon, export=1
+++        add         ip, sp, #4*4
+++        push        {a2-a4,v1-v8,lr}
+++        ldmia       ip, {v5-v7}
+++1:      ldmdb       ip, {v1-v4}
+++        ldrsb       a3, [v5, #8]    @ curr->ref_idx
+++        ldrsb       v8, [v5, #9]
+++        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
+++        ldrsb       lr, [v6, #9]
+++        ldr         v1, [v1, a3, lsl #2]
+++        ldrb        a3, [v5, #10]   @ curr->pred_flag
+++        ldr         v2, [v2, v8, lsl #2]
+++        ldrb        v8, [v6, #10]   @ neigh->pred_flag
+++        ldr         v3, [v3, ip, lsl #2]
+++        ldr         v4, [v4, lr, lsl #2]
+++        teq         a3, #3
+++        beq         20f
+++        teq         v8, #3
+++        beq         90f
+++
+++        tst         a3, #1
+++        itee        ne
+++        ldrne       a3, [v5, #0]    @ curr->mv[0]
+++        ldreq       a3, [v5, #4]    @ curr->mv[1]
+++        moveq       v1, v2
+++        tst         v8, #1
+++        itee        ne
+++        ldrne       v8, [v6, #0]    @ neigh->mv[0]
+++        ldreq       v8, [v6, #4]    @ neigh->mv[1]
+++        moveq       v3, v4
+++        teq         v1, v3
+++        bne         10f
+++        ldr         lr, =0xFFFCFFFC
+++        ssub16      ip, v8, a3
+++        ssub16      a3, a3, v8
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        @ drop through
+++10:     it          ne
+++        movne       a3, #1
+++11:     subs        a2, a2, #1
+++12:
+++A       strbhs      a3, [v7], a4
+++T       itt         hs
+++T       strbhs      a3, [v7]
+++T       addhs       v7, v7, a4
+++        subs        a2, a2, #1
+++        bhs         12b
+++
+++        ldm         sp, {a2, a3}
+++        add         ip, sp, #16*4
+++        subs        a1, a1, #1
+++        add         v5, v5, a3
+++        add         v6, v6, a3
+++        bhi         1b
+++        pop         {a2-a4,v1-v8,pc}
+++
+++20:     teq         v8, #3
+++        bne         10b
+++
+++        teq         v1, v3
+++        it          eq
+++        teqeq       v2, v4
+++        bne         40f
+++        teq         v1, v2
+++        bne         30f
+++
+++        ldrd        v1, v2, [v5]    @ curr->mv
+++        ldrd        v3, v4, [v6]    @ neigh->mv
+++        ldr         lr, =0xFFFCFFFC
+++        ssub16      ip, v3, v1
+++        ssub16      a3, v1, v3
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        bne         25f
+++        ssub16      ip, v4, v2
+++        ssub16      a3, v2, v4
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        beq         11b
+++        @ drop through
+++25:     ssub16      ip, v4, v1
+++        ssub16      a3, v1, v4
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        bne         10b
+++        ssub16      ip, v3, v2
+++        ssub16      a3, v2, v3
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        b           10b
+++
+++30:     ldrd        v1, v2, [v5]    @ curr->mv
+++        ldrd        v3, v4, [v6]    @ neigh->mv
+++        ldr         lr, =0xFFFCFFFC
+++        ssub16      ip, v3, v1
+++        ssub16      a3, v1, v3
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        bne         10b
+++        ssub16      ip, v4, v2
+++        ssub16      a3, v2, v4
+++        sel         a3, a3, ip
+++        ands        a3, a3, lr
+++        b           10b
+++
+++40:     teq         v1, v4
+++        ite         eq
+++        teqeq       v2, v3
+++        bne         10b
+++
+++        ldrd        v1, v2, [v5]    @ curr->mv
+++        ldrd        v3, v4, [v6]    @ neigh->mv
+++        ldr         lr, =0xFFFCFFFC
+++        b           25b
+++
+++90:     mov         a3, #1
+++        b           11b
+++endfunc
++diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+ new file mode 100644
+-index 0000000..85a9102
++index 0000000..00eab9e
+ --- /dev/null
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -0,0 +1,212 @@
+-+unsigned char rpi_hevc_transform [] = {
+-+169,
+-+3,
+-+3,
+-+232,
+-+128,
+-+0,
+-+0,
+-+0,
+-+20,
+-+248,
+-+0,
+-+136,
+-+0,
+-+0,
+-+192,
+-+248,
+-+0,
+-+0,
+-+0,
+-+96,
+-+3,
+-+232,
+-+32,
+-+0,
+-+0,
+-+0,
+-+7,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+8,
+-+232,
+-+0,
+-+4,
+-+0,
+-+0,
+-+12,
+-+248,
+-+0,
+-+128,
+-+0,
+-+0,
+-+192,
+-+8,
+-+4,
+-+0,
+-+4,
+-+232,
+-+64,
+-+0,
+-+0,
+-+0,
+-+5,
+-+232,
+-+0,
+-+0,
+-+8,
+-+0,
+-+128,
+-+69,
+-+113,
+-+66,
+-+12,
+-+248,
+-+0,
+-+128,
+-+0,
+-+0,
+-+192,
+-+8,
+-+4,
+-+0,
+-+128,
+-+69,
+-+113,
+-+70,
+-+128,
+-+144,
+-+39,
+-+0,
+-+4,
+-+255,
+-+48,
+-+192,
+-+128,
+-+3,
+-+32,
+-+8,
+-+16,
+-+0,
+-+76,
+-+254,
+-+48,
+-+192,
+-+9,
+-+4,
+-+32,
+-+8,
+-+0,
+-+0,
+-+4,
+-+254,
+-+0,
+-+144,
+-+128,
+-+2,
+-+0,
+-+248,
+-+62,
+-+0,
+-+128,
+-+144,
+-+22,
+-+0,
+-+4,
+-+255,
+-+48,
+-+192,
+-+128,
+-+3,
+-+32,
+-+8,
+-+16,
+-+0,
+-+76,
+-+254,
+-+48,
+-+192,
+-+9,
+-+4,
+-+32,
+-+8,
+-+0,
+-+0,
+-+140,
+-+248,
+-+44,
+-+0,
+-+0,
+-+0,
+-+32,
+-+48,
+-+4,
+-+0,
+-+128,
+-+69,
+-+113,
+-+66,
+-+242,
+-+140,
+-+211,
+-+192,
+-+41,
+-+3,
+-+68,
+-+192,
+-+80,
+-+7,
+-+164,
+-+255,
+-+36,
+-+220,
+-+96,
+-+2,
+-+0,
+-+248,
+-+62,
+-+0,
+-+3,
+-+255,
+-+55,
+-+208,
+-+120,
+-+3,
+-+224,
+-+3,
+-+190,
+-+11,
+-+16,
+-+139,
+-+246,
+-+83,
+-+0,
+-+103,
+-+90,
+-+0,
+-+8,
+-+240,
+-+0,
+-+128,
+-+128,
+-+3,
+-+0,
+-+247,
+-+32,
+-+128,
+-+10,
+-+4,
+-+136,
+-+240,
+-+32,
+-+0,
+-+128,
+-+3,
+-+112,
+-+96,
+-+90,
+-+0,
+-+};
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-new file mode 100644
+-index 0000000..5e2728d
+---- /dev/null
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -0,0 +1,147 @@
+-+# ******************************************************************************
+-+# Argon Design Ltd.
+-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+-+#
+-+# Module : HEVC
+-+# Author : Peter de Rivaz
+-+# ******************************************************************************
+++++ b/libavcodec/arm/hevcdsp_epel_neon.S
++@@ -0,0 +1,337 @@
+++/*
+++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+++ *
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+ +
+-+# HEVC VPU Transform
+-+#
+-+# Transform matrix can be thought of as
+-+#   output row vector = input row vector * transMatrix2
+-+#
+-+# The even rows of the matrix are symmetric
+-+# The odd rows of the matrix are antisymmetric
+-+#
+-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+-+#
+-+# EXAMPLE
+-+#   (a b c d) (1 2  2  1)
+-+#             (3 4 -4 -3)
+-+#             (5 6  6  5)
+-+#             (7 8 -8 -7)
+-+#
+-+#  x=(a c)(1 2) = 1a+5c 2a+6c
+-+#         (5 6)
+-+#
+-+#  y=(b d)(3 4) = 3b+7d 4b+8d
+-+#         (7 8)
+-+#
+-+#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+-+#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+-+#
+-+#  Final results are (u , v[::-1])
+-+#
+-+#
+-+#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+-+#  Apply the even matrix first and stop before rounding
+-+#  Then apply the odd matrix in a full manner:
+-+#
+-+#   First step is to compute partial products with the first input (16 cycles)
+-+#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
+-+#   2a 4b 6c 8d
+-+#   2a -4b 6c -8d
+-+#   1a -3b 5c -7d
+-+#
+-+#   Second step is to sum partial products into final position (8 cycles)
+-+#   1a+3b+5c+7d
+-+#   2a+4b+6c+8d
+-+#   2a-4b+6c-8d
+-+#   1a-3b+5c-7d
+-+#
+-+#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+-+#
+-+#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+-+#
+-+#   For 8x8 we could compute two in parallel.
+-+#
+-+#
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++#define MAX_PB_SIZE #64
+++
+++.macro vextin_d4
+++    vld1.8    {q10}, [r1], r2
+++    vmov      d16, d20
+++    vext.8    d17, d20, d21, #1
+++    vext.8    d18, d20, d21, #2
+++    vext.8    d19, d20, d21, #3
+++.endm
+++
+++.macro vextin_d4_8
+++    vld1.8    d16, [r1], r2
+++    vext.8    d17, d16, d16, #1
+++    vext.8    d18, d16, d16, #2
+++    vext.8    d19, d16, d16, #3
+++.endm
+++
+++.macro load_coeffs_16b coeffs
+++    ldr      \coeffs, [\coeffs]
+++    vdup.i8  d0, \coeffs
+++    lsr      \coeffs, #8
+++    vdup.i8  d1, \coeffs
+++    lsr      \coeffs, #8
+++    vdup.i8  d2, \coeffs
+++    lsr      \coeffs, #8
+++    vdup.i8  d3, \coeffs
+++.endm
+++
+++.macro epel_filter_16b out=q12
+++    vmull.u8 q3, d16, d0
+++    vmull.u8 q11, d19, d3
+++    vmull.u8 \out, d17, d1
+++    vmull.u8 q10, d18, d2
+++    vadd.s16 q3, q11
+++    vadd.s16 \out, q10
+++    vsub.s16 \out, q3
+++.endm
+++
+++.macro load_coeffs_32b coeffs
+++    ldr      \coeffs, [\coeffs]
+++    vmov.i64 d4, #0
+++    vmov.8   d4[0], \coeffs
+++    lsr      \coeffs, #8
+++    vmov.8   d4[2], \coeffs
+++    lsr      \coeffs, #8
+++    vmov.8   d4[4], \coeffs
+++    lsr      \coeffs, #8
+++    vmov.8   d4[6], \coeffs
+++.endm
+++
+++.macro epel_filter_32b
+++    vmull.s16 q3, d24, d4[0] //q12
+++    vmull.s16 q4, d25, d4[0]
+++    vmull.s16 q5, d30, d4[3] //q15
+++    vmull.s16 q6, d31, d4[3]
+++
+++    vmull.s16 q7, d26, d4[1] // q13
+++    vmull.s16 q8, d27, d4[1]
+++    vmull.s16 q9, d28, d4[2] // q14
+++    vmull.s16 q10, d29, d4[2]
+++    vadd.s32 q3, q5
+++    vadd.s32 q4, q6
+++    vadd.s32 q7, q9
+++    vadd.s32 q8, q10
+++    vsub.s32 q7, q3
+++    vsub.s32 q8, q4
+++    vqshrn.s32  d6, q7, #6
+++    vqshrn.s32  d7, q8, #6
+++.endm
+++
+++.macro epel_filter_32b_4
+++    vmull.s16 q3, d24, d4[0] //q12
+++    vmull.s16 q5, d30, d4[3] //q15
+++    vmull.s16 q7, d26, d4[1] // q13
+++    vmull.s16 q9, d28, d4[2] // q14
+++    vadd.s32 q3, q5
+++    vadd.s32 q7, q9
+++    vsub.s32 q7, q3
+++    vqshrn.s32  d6, q7, #6
+++.endm
+++
+++function ff_hevc_put_epel_h_neon_8, export=1
+++        push   {r4-r7}
+++        mov    r4, MAX_PB_SIZE
+++        ldr    r7, [sp, #16] // mx
+++        ldr    r5, [sp, #24] // width
+++        sub    r7, #1
+++        lsl    r7, #2
+++        vpush {d8-d15}
+++@ adr reaches if we are in thumb mode but not in arm
+++T       adr    r12, epel_coeffs
+++A       adrl   r12, epel_coeffs
+++        add    r7, r12
+++        sub       r1, #1
+++        lsl       r4, #1
+++        load_coeffs_16b r7
+++        mov   r12, r3
+++        mov   r6, r0
+++        mov   r7, r1
+++        cmp       r5, #6
+++        bgt       8f
+++        cmp       r5, #4
+++        blt       2f
+++        b         4f
+++8:      subs r3, #1
+++        pld [r1]
+++        vextin_d4
+++        epel_filter_16b
+++        vst1.16    {q12}, [r0], r4
+++        bne 8b
+++        subs    r5, #8
+++        beq  99f
+++        mov       r3, r12
+++        add       r6, #16
+++        mov       r0, r6
+++        add       r7, #8
+++        mov       r1, r7
+++        cmp       r5, #4
+++        bgt       8b
+++4:      subs r3, #1
+++        pld [r1]
+++        vextin_d4_8
+++        epel_filter_16b
+++        vst1.16    d24, [r0], r4
+++        bne 4b
+++        subs      r5, #4
+++        beq       99f
+++        mov       r3, r12
+++        add       r6, #8
+++        mov       r0, r6
+++        add       r7, #4
+++        mov       r1, r7
+++2:      subs r3, #1
+++        pld [r1]
+++        vextin_d4_8
+++        epel_filter_16b
+++        vst1.32    d24[0], [r0], r4
+++        bne 2b
+++99:     vpop {d8-d15}
+++        pop {r4-r7}
+++        bx lr
+++endfunc
+ +
+-+test_add:
+-+  vldh HX(0,0),(r0)
+-+  vadd HX(0,0),HX(0,0),10
+-+  vsth HX(0,0),(r0)
+-+  mov r0,7 # return value
+-+  b lr
+++function ff_hevc_put_epel_v_neon_8, export=1
+++        push   {r4-r7}
+++        mov    r4, MAX_PB_SIZE
+++        ldr    r7, [sp, #20] // my
+++        ldr    r5, [sp, #24] // width
+++        sub    r7, #1
+++        lsl    r7, #2
+++        vpush {d8-d15}
+++T       adr    r12, epel_coeffs
+++A       adrl   r12, epel_coeffs
+++        add    r7, r12
+++        load_coeffs_16b r7
+++        sub       r1, r2
+++        lsl       r4, #1
+++        mov   r12, r3
+++        mov   r6, r0
+++        mov   r7, r1
+++0:      pld [r1]
+++        vld1.8    {d16}, [r1], r2
+++        pld [r1]
+++        vld1.8    {d17}, [r1], r2
+++        pld [r1]
+++        vld1.8    {d18}, [r1], r2
+++        cmp       r5, #6
+++        bgt       8f
+++        cmp       r5, #4
+++        blt       2f
+++        b         4f
+++8:      pld [r1]
+++        vld1.8    {d19}, [r1], r2
+++        subs r3, #1
+++        epel_filter_16b
+++        vst1.16    {q12}, [r0], r4
+++        vmov d16, d17
+++        vmov d17, d18
+++        vmov d18, d19
+++        bne 8b
+++        subs    r5, #8
+++        beq  99f
+++        mov       r3, r12
+++        add       r6, #16
+++        mov       r0, r6
+++        add       r7, #8
+++        mov       r1, r7
+++        b         0b
+++4:      pld       [r1]
+++        vld1.8    {d19}, [r1], r2
+++        subs r3, #1
+++        epel_filter_16b
+++        vst1.16    d24, [r0], r4
+++        vmov d16, d17
+++        vmov d17, d18
+++        vmov d18, d19
+++        bne 4b
+++        subs      r5, #4
+++        beq       99f
+++        mov       r3, r12
+++        add       r6, #8
+++        mov       r0, r6
+++        add       r7, #4
+++        mov       r1, r7
+++        b         0b
+++2:      pld [r1]
+++        vld1.8    {d19}, [r1], r2
+++        subs r3, #1
+++        epel_filter_16b
+++        vst1.32    d24[0], [r0], r4
+++        vmov d16, d17
+++        vmov d17, d18
+++        vmov d18, d19
+++        bne 2b
+++99:     vpop {d8-d15}
+++        pop {r4-r7}
+++        bx lr
+++endfunc
+ +
+-+# Columns are transformed first
+-+#
+-+# Store top left half of transMatrix2 in
+-+# Store bottom left half of transMatrix2 in HX(32,32)
+-+#
+-+# For 16x16
+-+# HX(0:15,0) contains input data before transform
+-+# HY(0:15,0) contains 32bit output data after transform
+-+# HX(32,0) contains even rows of left half of transMatrix2
+-+# HX(32,32) contains odd rows of left half of transMatrix2
+-+# HY(48,0) contains partial products ready for summing
+-+#
+++function ff_hevc_put_epel_hv_neon_8, export=1
+++        push   {r4-r7}
+++        mov    r4, MAX_PB_SIZE
+++        ldr    r6, [sp, #16] // mx
+++        ldr    r7, [sp, #20] // my
+++        ldr    r5, [sp, #24] // width
+++        sub    r7, #1
+++        lsl    r7, #2
+++        vpush {d8-d15}
+++        adr    r12, epel_coeffs
+++        sub    r6, #1
+++        lsl    r6, #2
+++        add    r6, r12 // mx epel coeff offset
+++        add    r7, r12
+++        sub       r1, #1
+++        sub       r1, r2
+++        lsl       r4, #1
+++        load_coeffs_16b r6
+++        load_coeffs_32b r7
+++        mov   r12, r3
+++        mov   r6, r0
+++        mov   r7, r1
+++0:      pld   [r1]
+++        vextin_d4
+++        epel_filter_16b q12
+++        pld   [r1]
+++        vextin_d4
+++        epel_filter_16b q13
+++        pld   [r1]
+++        vextin_d4
+++        epel_filter_16b q14
+++        cmp       r5, #6
+++        bgt       8f
+++        cmp       r5, #4
+++        blt       2f
+++        b         4f
+++8:      pld     [r1]
+++        vextin_d4
+++        epel_filter_16b q15
+++        subs r3, #1
+++        epel_filter_32b
+++        vst1.16    {q3}, [r0], r4
+++        vmov q12, q13
+++        vmov q13, q14
+++        vmov q14, q15
+++        bne 8b
+++        subs    r5, #8
+++        beq  99f
+++        mov       r3, r12
+++        add       r6, #16
+++        mov       r0, r6
+++        add       r7, #8
+++        mov       r1, r7
+++        b         0b
+++4:      pld      [r1]
+++        vextin_d4_8
+++        epel_filter_16b q15
+++        subs r3, #1
+++        epel_filter_32b_4
+++        vst1.16    d6, [r0], r4
+++        vmov q12, q13
+++        vmov q13, q14
+++        vmov q14, q15
+++        bne 4b
+++        subs      r5, #4
+++        beq       99f
+++        mov       r3, r12
+++        add       r6, #8
+++        mov       r0, r6
+++        add       r7, #4
+++        mov       r1, r7
+++        b         0b
+++2:      pld      [r1]
+++        vextin_d4_8
+++        epel_filter_16b q15
+++        subs r3, #1
+++        epel_filter_32b_4
+++        vst1.32    d6[0], [r0], r4
+++        vmov q12, q13
+++        vmov q13, q14
+++        vmov q14, q15
+++        bne 2b
+++99:     vpop {d8-d15}
+++        pop {r4-r7}
+++        bx lr
+++endfunc
+ +
+++epel_coeffs:
+++       .byte 2, 58, 10, 2
+++       .byte 4, 54, 16, 2
+++       .byte 6, 46, 28, 4
+++       .byte 4, 36, 36, 4
+++       .byte 4, 28, 46, 6
+++       .byte 2, 16, 54, 4
+++       .byte 2, 10, 58, 2
++diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
++index 5591807..49c70dd 100644
++--- a/libavcodec/arm/hevcdsp_init_neon.c
+++++ b/libavcodec/arm/hevcdsp_init_neon.c
++@@ -22,6 +22,8 @@
++ #include "libavutil/arm/cpu.h"
++ #include "libavcodec/hevcdsp.h"
++ #include "hevcdsp_arm.h"
+++#include "libavcodec/avcodec.h"
+++#include "libavcodec/bit_depth_template.c"
++ 
++ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++ 
+++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
+++
+++void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++
+++void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+++
++ #define PUT_PIXELS(name) \
++     void name(int16_t *dst, uint8_t *src, \
++                                 ptrdiff_t srcstride, int height, \
++@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
++ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
++ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
++ #undef PUT_PIXELS
+++void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
+++void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
+++void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
+++                                ptrdiff_t srcstride, int height,
+++                                intptr_t mx, intptr_t my, int width);
++ 
++ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, int width);
++@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
++     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
++ }
++ 
+++static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int8_t offset_table[32] = { 0 };
+++    int k, y, x;
+++    int shift  = 3; // BIT_DEPTH - 5
+++    int cwidth = 0;
+ +
+-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
+-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+-+# num: number of 16x16 transforms to be done
+-+#
+-+hevc_trans_16x16:
+-+  push r6-r15, lr # TODO cut down number of used registers
+++    stride_src /= sizeof(pixel);
+++    stride_dst /= sizeof(pixel);
+ +
+-+  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
+-+  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+  # Now use r0 to describe which matrix we are working on.
+-+  # Allows us to prefetch the next block of coefficients for efficiency.
+-+  mov r0,0 # This describes the location where we read our coefficients from
+-+  mov r3,16*2 # Stride of coefficients in bytes
+-+  mov r7,16*16*2 # Total block size
+-+  mov r8,64*16 # Value used to swap from current to next VRF location
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  mov r4,64 # Constant used for rounding first pass
+-+  mov r5,1<<19 # Constant used for rounding second pass
+++    for (k = 0; k < 4; k++)
+++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+ +
+-+  # At start of block r0,r1 point to the current block (that has already been loaded)
+-+block_loop:
+-+  eor r0,r8
+-+  add r1,r7
+-+  # Prefetch the next block
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  eor r0,r8
+-+  sub r1,r7
+++    if (height % 8 == 0)
+++        cwidth = width;
+ +
+-+  # Transform the current block
+-+  bl col_trans_16
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+-+  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
+++    switch(cwidth){
+++    case 8:
+++        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 16:
+++        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 32:
+++        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    case 64:
+++        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+++        break;
+++    default:
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width; x++)
+++                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+++            dst += stride_dst;
+++            src += stride_src;
+++        }
+++    }
+++}
+ +
+-+  bl col_trans_16
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+++#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+++static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+++                                          int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+++    static const int8_t pos[4][2][2] = {
+++        { { -1,  0 }, {  1, 0 } }, // horizontal
+++        { {  0, -1 }, {  0, 1 } }, // vertical
+++        { { -1, -1 }, {  1, 1 } }, // 45 degree
+++        { {  1, -1 }, { -1, 1 } }, // 135 degree
+++    };
+++    int8_t sao_offset_val[8];  // padding of 3 for vld
+++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int a_stride, b_stride;
+++    int x, y;
+++    int cwidth = 0;
+++
+++    for (x = 0; x < 5; x++) {
+++        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
+++    }
+ +
+-+  # Save results - note there has been a transposition during the processing so we save columns
+-+  vsth VX(0,32++)+r0, (r1 += r3) REP 16
+++    if (height % 8 == 0)
+++        cwidth = width;
+ +
+-+  # Move onto next block
+-+  eor r0,r8
+-+  add r1,r7
+++    stride_src /= sizeof(pixel);
+++    stride_dst /= sizeof(pixel);
+ +
+-+  addcmpbgt r2,-1,0,block_loop
+-+  pop r6-r15, pc
+++    switch (cwidth) {
+++    case 32:
+++        switch(eo) {
+++        case 0:
+++            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 1:
+++            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 2:
+++            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 3:
+++            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        }
+++        break;
+++    case 64:
+++        switch(eo) {
+++        case 0:
+++            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 1:
+++            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 2:
+++            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        case 3:
+++            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+++            break;
+++        }
+++        break;
+++    default:
+++        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+++        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width; x++) {
+++                int diff0         = CMP(src[x], src[x + a_stride]);
+++                int diff1         = CMP(src[x], src[x + b_stride]);
+++                int idx           = diff0 + diff1;
+++                if (idx)
+++                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+++            }
+++            src += stride_src;
+++            dst += stride_dst;
+++        }
+++    }
+++}
+++#undef CMP
+ +
+-+# r1,r2,r3 r7,r8 should be preserved
+-+# HX(0++,0)+r0 is the block to be transformed
+-+# HX(32++,0) is the 16x16 matrix of transform coefficients
+-+# Use HY(48,0) for intermediate results
+-+# r0 can be used, but should be returned to its original value at the end
+-+col_trans_16:
+-+  add r4,r0,16 # Final value for this loop
+-+col_trans_16_loop:
+-+  # First compute partial products for a single column
+-+  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
+-+  # Then sum up the results and place back
+-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-+  addcmpblt r0,1,r4,col_trans_16_loop
+-+  sub r0,16  # but r0 back to its original value
+-+  b lr
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-new file mode 100644
+-index 0000000..536896f
+---- /dev/null
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -0,0 +1,293 @@
+-+/*
+-+Copyright (c) 2012, Broadcom Europe Ltd.
+-+All rights reserved.
+++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                                MvField *curr, MvField *neigh, uint8_t *bs);
+ +
+-+Redistribution and use in source and binary forms, with or without
+-+modification, are permitted provided that the following conditions are met:
+-+    * Redistributions of source code must retain the above copyright
+-+      notice, this list of conditions and the following disclaimer.
+-+    * Redistributions in binary form must reproduce the above copyright
+-+      notice, this list of conditions and the following disclaimer in the
+-+      documentation and/or other materials provided with the distribution.
+-+    * Neither the name of the copyright holder nor the
+-+      names of its contributors may be used to endorse or promote products
+-+      derived from this software without specific prior written permission.
++ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++ {
++     if (bit_depth == 8) {
++@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
++         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
++         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+++        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+++          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
+++          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
+++        }
++         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
++         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
++         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
++@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
++             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
++             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+++            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
+++            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
+++            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
++         }
+++        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+++        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+++        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+++        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
+++        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
+++        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
+++        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
+++        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
+++        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
+++        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
+++
++         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
++         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
++         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
++@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
++         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
++     }
+ +
+-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-+*/
+++    assert(offsetof(MvField, mv) == 0);
+++    assert(offsetof(MvField, ref_idx) == 8);
+++    assert(offsetof(MvField, pred_flag) == 10);
+++    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
++ }
++diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
++new file mode 100644
++index 0000000..9c7808d
++--- /dev/null
+++++ b/libavcodec/arm/hevcdsp_sao_neon.S
++@@ -0,0 +1,510 @@
+++/*
+++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+++ *
+++ * This file is part of FFmpeg.
+++ *
+++ * FFmpeg is free software; you can redistribute it and/or
+++ * modify it under the terms of the GNU Lesser General Public
+++ * License as published by the Free Software Foundation; either
+++ * version 2.1 of the License, or (at your option) any later version.
+++ *
+++ * FFmpeg is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+++ * Lesser General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU Lesser General Public
+++ * License along with FFmpeg; if not, write to the Free Software
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ */
+ +
+-+#include <stdio.h>
+-+#include <string.h>
+-+#include <stdlib.h>
+-+#include <fcntl.h>
+-+#include <unistd.h>
+-+#include <assert.h>
+-+#include <stdint.h>
+-+#include <sys/mman.h>
+-+#include <sys/ioctl.h>
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++.macro init_sao_band
+++        pld      [r1]
+++        vld1.8   {q0, q1}, [r2]  // offset table
+++        ldr       r2, [sp, #0]   // stride_dst
+++        ldr      r12, [sp, #4]   // height
+++        vmov.u8  q3, #128
+++.endm
+++
+++// 128 in q3
+++// input q8 - q11
+++.macro sao_band_64
+++        vtbl.8   d24, {d0, d1, d2, d3}, d24
+++        vadd.s8  q8, q3
+++        vtbl.8   d25, {d0, d1, d2, d3}, d25
+++        vadd.s8  q9, q3
+++        vtbl.8   d26, {d0, d1, d2, d3}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d0, d1, d2, d3}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0, d1, d2, d3}, d28
+++        vqadd.s8 q8, q12
+++        vtbl.8   d29, {d0, d1, d2, d3}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d0, d1, d2, d3}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d0, d1, d2, d3}, d31
+++        vsub.s8  q8, q3
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++.endm
+++
+++function ff_hevc_sao_band_w8_neon_8, export=1
+++        init_sao_band
+++1:      subs     r12, #8
+++        vld1.8   {d16}, [r1, :64], r3
+++        vld1.8   {d17}, [r1, :64], r3
+++        vshr.u8  q12, q8, #3
+++        vld1.8   {d18}, [r1, :64], r3
+++        vld1.8   {d19}, [r1, :64], r3
+++        vshr.u8  q13, q9, #3
+++        vld1.8   {d20}, [r1, :64], r3
+++        vld1.8   {d21}, [r1, :64], r3
+++        vshr.u8  q14, q10, #3
+++        vld1.8   {d22}, [r1, :64], r3
+++        vld1.8   {d23}, [r1, :64], r3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8  {d16}, [r0, :64], r2
+++        vst1.8  {d17}, [r0, :64], r2
+++        vst1.8  {d18}, [r0, :64], r2
+++        vst1.8  {d19}, [r0, :64], r2
+++        vst1.8  {d20}, [r0, :64], r2
+++        vst1.8  {d21}, [r0, :64], r2
+++        vst1.8  {d22}, [r0, :64], r2
+++        vst1.8  {d23}, [r0, :64], r2
+++        bne    1b
+++
+++        bx lr
+++endfunc
+ +
+-+#include <linux/ioctl.h>
+++function ff_hevc_sao_band_w16_neon_8, export=1
+++        init_sao_band
+++1:      subs     r12, #4
+++        vld1.8  {q8}, [r1, :128], r3
+++        vshr.u8  q12, q8, #3
+++        vld1.8  {q9}, [r1, :128], r3
+++        vshr.u8  q13, q9, #3
+++        vld1.8  {q10}, [r1, :128], r3
+++        vshr.u8  q14, q10, #3
+++        vld1.8  {q11}, [r1, :128], r3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8   {q8}, [r0, :128], r2
+++        vst1.8   {q9}, [r0, :128], r2
+++        vst1.8   {q10}, [r0, :128], r2
+++        vst1.8   {q11}, [r0, :128], r2
+++        bne    1b
+++
+++        bx lr
+++endfunc
+ +
+-+#define MAJOR_NUM 100
+-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+-+#define DEVICE_FILE_NAME "/dev/char_dev"
+++function ff_hevc_sao_band_w32_neon_8, export=1
+++        init_sao_band
+++1:      subs     r12, #2
+++        vld1.8   {q8-q9}, [r1, :128], r3
+++        vshr.u8  q12, q8, #3
+++        vshr.u8  q13, q9, #3
+++        vld1.8   {q10-q11}, [r1, :128], r3
+++        vshr.u8  q14, q10, #3
+++        vshr.u8  q15, q11, #3
+++        sao_band_64
+++        vst1.8   {q8-q9}, [r0, :128], r2
+++        vst1.8   {q10-q11}, [r0, :128], r2
+++        bne      1b
+++
+++        bx       lr
+++endfunc
+ +
+-+#include "rpi_mailbox.h"
+++function ff_hevc_sao_band_w64_neon_8, export=1
+++        init_sao_band
+++1:      subs      r12, #1
+++        pld       [r1, r3]
+++        vld1.8    {q8-q9}, [r1, :128]!
+++        vshr.u8  q12, q8, #3
+++        vshr.u8  q13, q9, #3
+++        vld1.8    {q10-q11}, [r1, :128], r3
+++        vshr.u8  q14, q10, #3
+++        vshr.u8  q15, q11, #3
+++        sub       r1, #32
+++        sao_band_64
+++        vst1.8    {q8-q9}, [r0, :128]!
+++        vst1.8    {q10-q11}, [r0, :128], r2
+++        sub       r0, #32
+++        bne       1b
+++
+++        bx lr
+++endfunc
+ +
+-+#define PAGE_SIZE (4*1024)
+++.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+++        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
+++        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
+++        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
+++        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
+++        vsub.s8 \out0, \tmp0, \out0 // diff0
+++        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+++.endm
+++
+++.macro table64
+++        vmov.s8 q13, #2 // 2 to all elements
+++        vmov.32  d24[0], r4  // load offset table from general registers
+++        vmov.32  d24[1], r5  // load rest of offset table
+++
+++        vadd.s8 q0, q13
+++        vadd.s8 q1, q13
+++        vadd.s8 q2, q13
+++        vadd.s8 q3, q13
+++
+++        vmov.u8  q15, #128 // s8 #-128
+++        vtbl.8   d0, {d24}, d0
+++        vadd.s8  q13,  q4, q15
+++        vtbl.8   d1, {d24}, d1
+++        vadd.s8  q14,  q5, q15
+++        vtbl.8   d2, {d24}, d2
+++        vqadd.s8 q0, q13
+++        vtbl.8   d3, {d24}, d3
+++        vqadd.s8 q1, q14
+++        vtbl.8   d4, {d24}, d4
+++        vadd.s8  q13,  q6, q15
+++        vtbl.8   d5, {d24}, d5
+++        vadd.s8  q14,  q7, q15
+++        vtbl.8   d6, {d24}, d6
+++        vqadd.s8 q2, q13
+++        vtbl.8   d7, {d24}, d7
+++        vqadd.s8 q3, q14
+++        vsub.s8   q0, q15
+++        vsub.s8   q1, q15
+++        vsub.s8   q2, q15
+++        vsub.s8   q3, q15
+++        vst1.8  {q0-q1}, [r0, :128]!
+++        vst1.8  {q2-q3}, [r0, :128], r2
+++        sub     r0, #32
+++.endm
+++
+++// input
+++// a in q0 - q3
+++// c in q4 - q7
+++// b in q8 - q11
+++// offset table in r4 and r5
+++// output in q0 - q3
+++// clobbers q12 - q15
+++.macro edge_w64_body
+++        diff32 q12, q13, q0, q1, q0, q1, q4, q5
+++        diff32 q0, q1, q14, q15, q8, q9, q4, q5
+++
+++        vadd.s8  q0, q12 //diff0 + diff1
+++        vadd.s8  q1, q13
+++
+++        diff32  q14, q15, q2, q3, q2, q3, q6, q7
+++        diff32  q2, q3, q12, q13, q10, q11, q6, q7
+++
+++        vadd.s8  q2, q14
+++        vadd.s8  q3, q15
+++        table64
+++.endm
+++
+++.macro init_edge_64
+++        push   {r4-r5}
+++        ldr    r12, [sp, #8] // height
+++        ldr    r5, [sp, #12] // sao_offset_val_table
+++        ldr    r4, [r5]
+++        add    r5, #4
+++        ldr    r5, [r5]
+++.endm
+++
+++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+++        init_edge_64
+++        vpush {d8-d15}
+++        sub    r1, #8
+++1:      subs    r12, #1
+++        vld1.64  {d7}, [r1, :64]!
+++        vld1.64  {q4-q5}, [r1, :128]! // load c
+++        vld1.64  {q6-q7}, [r1, :128]!
+++        vld1.64  {d24}, [r1, :64], r3
+++        sub      r1, #72
+++        // load a
+++        vext.8 q0, q3, q4, #15
+++        vext.8 q1, q4, q5, #15
+++        vext.8 q2, q5, q6, #15
+++        vext.8 q3, q6, q7, #15
+++        // load b
+++        vext.8 q8, q4, q5, #1
+++        vext.8 q9, q5, q6, #1
+++        vext.8 q10, q6, q7, #1
+++        vext.8 q11, q7, q12, #1
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+// Shared memory will not be cached in ARM cache
+-+void *mapmem_shared(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_SHARED/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+++        init_edge_64
+++        vpush {d8-d15}
+++        sub     r1, r3
+++        // load a
+++        vld1.8  {q0-q1}, [r1, :128]!
+++        vld1.8  {q2-q3}, [r1, :128], r3
+++        sub     r1, #32
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q8-q9}, [r1, :128]!
+++        vld1.8  {q10-q11}, [r1, :128], r3
+++        sub     r1, #32
+++        edge_w64_body
+++        // copy c to a
+++        vmov.64 q0, q4
+++        vmov.64 q1, q5
+++        vmov.64 q2, q6
+++        vmov.64 q3, q7
+++        // copy b to c
+++        vmov.64 q4, q8
+++        vmov.64 q5, q9
+++        vmov.64 q6, q10
+++        vmov.64 q7, q11
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
+-+void *mapmem_private(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_PRIVATE/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+++function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+++        init_edge_64
+++        vpush {d8-d15}
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        sub     r1, #1
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #31
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        add     r1, #1
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #33
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+void unmapmem(void *addr, unsigned size)
+-+{
+-+   int s = munmap(addr, size);
+-+   if (s != 0) {
+-+      printf("munmap error %d\n", s);
+-+      exit (-1);
+-+   }
+-+}
+++function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+++        init_edge_64
+++        vpush {d8-d15}
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        add     r1, #1
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #33
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        sub     r1, #1
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #31
+++        edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r5}
+++        bx lr
+++endfunc
+ +
+-+/*
+-+ * use ioctl to send mbox property message
+-+ */
+++.macro init_edge_32
+++        ldr     r12, [sp, #4] // sao_offset_val_table
+++        vld1.32 {d31}, [r12]
+++        ldr     r12, [sp] // height
+++.endm
+++
+++.macro diff out0, tmp0, in0, in1
+++        vcgt.u8 \out0, \in1, \in0  // c > a -> -1 , otherwise 0
+++        vcgt.u8 \tmp0,  \in0, \in1  // a > c -> -1 , otherwise 0
+++        vsub.s8 \out0, \tmp0, \out0 // diff0
+++.endm
+++
+++.macro table32
+++        vmov.s8  q10, #2
+++        vadd.s8  q0, q10
+++        vadd.s8  q1, q10
+++        vmov.s8  q10, #128
+++        vtbl.8   d0, {d31}, d0
+++        vadd.s8  q11, q2, q10
+++        vtbl.8   d1, {d31}, d1
+++        vadd.s8  q12, q3, q10
+++        vtbl.8   d2, {d31}, d2
+++        vqadd.s8 q11, q0
+++        vtbl.8   d3, {d31}, d3
+++        vqadd.s8 q12, q1
+++        vsub.s8  q0, q11, q10
+++        vsub.s8  q1, q12, q10
+++        vst1.8   {q0-q1}, [r0, :128], r2
+++.endm
+++
+++function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
+++        init_edge_32
+++        vpush {q4-q7}
+++        sub     r1, #4
+++1:      subs    r12, #1
+++        vld1.8  {q13-q14}, [r1]!
+++        vld1.32 d30, [r1], r3
+++        sub     r1, #32
+++        // a
+++        vext.8   q0, q13, q14, #3
+++        vext.8   q1, q14, q15, #3
+++        vshr.u64 d24, d30, #24
+++        // c
+++        vext.8   q2, q13, q14, #4
+++        vext.8   q3, q14, q15, #4
+++        vshr.u64 d16, d30, #32
+++        // diff0
+++        diff32 q13, q14, q4, q5, q0, q1, q2, q3
+++        diff   d18, d25, d24, d16
+++        // -diff1
+++        vext.s8 q0, q13, q14, #1
+++        vext.s8 q1, q14, q9, #1
+++
+++        vsub.s8 q0, q13, q0 //diff0 + diff1
+++        vsub.s8 q1, q14, q1
+++        table32
+++        bne     1b
+++        vpop {q4-q7}
+++
+++        bx      lr
+++endfunc
+ +
+-+static int mbox_property(int file_desc, void *buf)
+-+{
+-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+++function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
+++        init_edge_32
+++        vpush {q4-q7}
+++        // load a
+++        sub     r1, r3
+++        vld1.8  {q0-q1}, [r1, :128], r3
+++        // load c
+++        vld1.8  {q2-q3}, [r1, :128], r3
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q8-q9}, [r1, :128], r3
+++        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
+++        vadd.s8 q0, q4, q12 //diff0 + diff1
+++        vadd.s8 q1, q5, q13
+++        table32
+++        // CMP ( c, a )
+++        vneg.s8 q12, q4
+++        vneg.s8 q13, q5
+++        // c
+++        vmov.64 q2, q8
+++        vmov.64 q3, q9
+++        bne     1b
+++        vpop {q4-q7}
+++        bx      lr
+++endfunc
+ +
+-+   if (ret_val < 0) {
+-+      printf("ioctl_set_msg failed:%d\n", ret_val);
+-+   }
+++function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
+++        init_edge_32
+++        vpush   {d8-d15}
+++        // load a
+++        sub     r1, r3
+++        sub     r1, #8
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {d24}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q0, q10, q11, #7
+++        vext.8  q1, q11, q12, #7
+++        // load c
+++        vld1.8  {d9}, [r1, :64]!
+++        vld1.8  {q2-q3}, [r1, :64], r3
+++        sub     r1, #8
+++        vext.8  q4, q4, q2, #15
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {q12}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q8, q10, q11, #9
+++        vext.8  q9, q11, q12, #9
+++        vext.8  q6, q10, q11, #8
+++        vext.8  q7, q11, q12, #8
+++        vext.8  q5, q10, q11, #7
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        vadd.s8 q0, q12 //diff0 + diff1
+++        vadd.s8 q1, q13
+++        table32
+++        // inputs for next loop iteration
+++        // a
+++        vmov.8  q0, q4
+++        vext.8  q1, q2, q3, #15
+++        // c
+++        vmov.8  q2, q6
+++        vmov.8  q3, q7
+++        vmov.8  q4, q5
+++        bne     1b
+++        vpop    {d8-d15}
+++        bx      lr
+++endfunc
+ +
+-+#ifdef DEBUG
+-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+-+   for (i=0; i<size/4; i++)
+-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+-+#endif
+-+   return ret_val;
+-+}
+++function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
+++        init_edge_32
+++        sub     r1, r3
+++        // load a
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {d24}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q0, q10, q11, #1
+++        vext.8  q1, q11, q12, #1
+++        // load c
+++        vld1.8  {q2-q3}, [r1, :64]!
+++        vld1.8  {d30}, [r1, :64], r3
+++        sub     r1, #40
+++1:      subs    r12, #1
+++        // load b
+++        vld1.8  {q10-q11}, [r1, :64]!
+++        vld1.8  {q12}, [r1, :64], r3
+++        sub     r1, #32
+++        vext.8  q8, q10, q11, #7
+++        vext.8  q9, q11, q12, #7
+++        vext.8  q14, q12, q10, #7
+++
+++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++
+++        vadd.s8 q0, q12 //diff0 + diff1
+++        vadd.s8 q1, q13
+++        table32
+++
+++        // inputs for next loop iteration
+++        // a
+++        vext.8  q0, q2, q3, #1
+++        vext.8  q1, q3, q15, #1
+++        // c
+++        vext.8  q2, q8, q9, #1
+++        vext.8  q3, q9, q14, #1
+++        vext.8  d30, d28, d2, #1
+++        bne     1b
+++        bx      lr
+++endfunc
+ +
+-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
++diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
++index 39713ed..25eb52b 100644
++--- a/libavcodec/avcodec.h
+++++ b/libavcodec/avcodec.h
++@@ -410,6 +410,8 @@ enum AVCodecID {
++     AV_CODEC_ID_SHEERVIDEO,
++     AV_CODEC_ID_YLC,
++ 
+++    AV_CODEC_ID_H264_MVC,
+++
++     /* various PCM "codecs" */
++     AV_CODEC_ID_FIRST_AUDIO = 0x10000,     ///< A dummy id pointing at the start of audio codecs
++     AV_CODEC_ID_PCM_S16LE = 0x10000,
++@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
++ #define FF_BUG_DC_CLIP          4096
++ #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
++ #define FF_BUG_TRUNCATED       16384
+++#define FF_BUG_GMC_UNSUPPORTED 32768
++ 
++     /**
++      * strictly follow the standard (MPEG-4, ...).
++@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext {
++ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE  244
++ #define FF_PROFILE_H264_HIGH_444_INTRA       (244|FF_PROFILE_H264_INTRA)
++ #define FF_PROFILE_H264_CAVLC_444            44
+++#define FF_PROFILE_H264_MULTIVIEW_HIGH       118
+++#define FF_PROFILE_H264_STEREO_HIGH          128
+++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
++ 
++ #define FF_PROFILE_VC1_SIMPLE   0
++ #define FF_PROFILE_VC1_MAIN     1
++@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext {
++ #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
++ #endif
++ 
+++    /**
+++     * Opaque pointer for use by replacement get_buffer2 code
+++     *
+++     * @author jc (08/02/2016)
+++     */
+++    void * get_buffer_context;
++ } AVCodecContext;
++ 
++ AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
++diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
++index 1bf1c62..ccfa991 100644
++--- a/libavcodec/cabac.h
+++++ b/libavcodec/cabac.h
++@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
++ typedef struct CABACContext{
++     int low;
++     int range;
++-    int outstanding_count;
+++    union
+++    {
+++        int outstanding_count;
+++        struct {
+++            uint16_t bits;
+++            uint16_t range;
+++        } by22;
+++    };
++     const uint8_t *bytestream_start;
++     const uint8_t *bytestream;
++     const uint8_t *bytestream_end;
++diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
++index 9d94b72..535ebf0 100644
++--- a/libavcodec/codec_desc.c
+++++ b/libavcodec/codec_desc.c
++@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
++         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
++         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
++     },
+++    {
+++        .id        = AV_CODEC_ID_H264_MVC,
+++        .type      = AVMEDIA_TYPE_VIDEO,
+++        .name      = "h264_mvc",
+++        .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
+++        .props     = AV_CODEC_PROP_LOSSY,
+++    },
++ 
++     /* various PCM "codecs" */
++     {
++diff --git a/libavcodec/h264.h b/libavcodec/h264.h
++index efe3555..16358aa 100644
++--- a/libavcodec/h264.h
+++++ b/libavcodec/h264.h
++@@ -126,7 +126,9 @@ enum {
++     NAL_END_STREAM      = 11,
++     NAL_FILLER_DATA     = 12,
++     NAL_SPS_EXT         = 13,
+++    NAL_SPS_SUBSET      = 15,
++     NAL_AUXILIARY_SLICE = 19,
+++    NAL_SLICE_EXT       = 20,
++     NAL_FF_IGNORE       = 0xff0f001,
++ };
++ 
++diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
++index ce4bab2..b9b0c78 100644
++--- a/libavcodec/h264_parser.c
+++++ b/libavcodec/h264_parser.c
++@@ -58,6 +58,8 @@ typedef struct H264ParseContext {
++     uint8_t parse_history[6];
++     int parse_history_count;
++     int parse_last_mb;
+++    int is_mvc;
+++    int slice_ext;
++ } H264ParseContext;
++ 
++ 
++@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
++         } else if (state <= 5) {
++             int nalu_type = buf[i] & 0x1F;
++             if (nalu_type == NAL_SEI || nalu_type == NAL_SPS ||
++-                nalu_type == NAL_PPS || nalu_type == NAL_AUD) {
+++                nalu_type == NAL_PPS || nalu_type == NAL_AUD ||
+++                nalu_type == NAL_SPS_SUBSET) {
++                 if (pc->frame_start_found) {
++                     i++;
++                     goto found;
++                 }
++             } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA ||
++-                       nalu_type == NAL_IDR_SLICE) {
+++                       nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) {
++                 state += 8;
+ +
+-+   p[i++] = 0x3000c; // (the tag id)
+-+   p[i++] = 12; // (size of the buffer)
+-+   p[i++] = 12; // (size of the data)
+-+   p[i++] = size; // (num bytes? or pages?)
+-+   p[i++] = align; // (alignment)
+-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+++                p->slice_ext = (nalu_type == NAL_SLICE_EXT);
++                 continue;
++             }
++             state = 7;
++         } else {
++             p->parse_history[p->parse_history_count++] = buf[i];
++-            if (p->parse_history_count > 5) {
+++            if (p->parse_history_count > 8) {
++                 unsigned int mb, last_mb = p->parse_last_mb;
++                 GetBitContext gb;
++ 
++-                init_get_bits(&gb, p->parse_history, 8*p->parse_history_count);
+++                init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext);
++                 p->parse_history_count = 0;
++                 mb= get_ue_golomb_long(&gb);
++                 p->parse_last_mb = mb;
++@@ -145,7 +150,7 @@ found:
++     pc->frame_start_found = 0;
++     if (p->is_avc)
++         return next_avc;
++-    return i - (state & 5) - 5 * (state > 7);
+++    return i - (state & 5) - 8 * (state > 7);
++ }
++ 
++ static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb,
++@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s,
++         }
++     }
++ 
++-    parse_nal_units(s, avctx, buf, buf_size);
+++    if (!p->is_mvc)
+++        parse_nal_units(s, avctx, buf, buf_size);
++ 
++     if (avctx->framerate.num)
++         avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
++@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx,
++         if ((state & 0xFFFFFF00) != 0x100)
++             break;
++         nalu_type = state & 0x1F;
++-        if (nalu_type == NAL_SPS) {
+++        if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) {
++             has_sps = 1;
++         } else if (nalu_type == NAL_PPS)
++             has_pps = 1;
++@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = {
++     .parser_close   = h264_close,
++     .split          = h264_split,
++ };
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++static av_cold int init_mvc(AVCodecParserContext *s)
+++{
+++    H264ParseContext *p = s->priv_data;
+++    int ret = init(s);
+++    if (ret < 0)
+++        return ret;
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++    p->is_mvc = 1;
+++    return 0;
+ +}
+ +
+-+unsigned mem_free(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++AVCodecParser ff_h264_mvc_parser = {
+++    .codec_ids      = { AV_CODEC_ID_H264_MVC },
+++    .priv_data_size = sizeof(H264ParseContext),
+++    .parser_init    = init_mvc,
+++    .parser_parse   = h264_parse,
+++    .parser_close   = h264_close,
+++    .split          = h264_split,
+++};
++diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
++index b478065..88dd40b 100644
++--- a/libavcodec/hevc.c
+++++ b/libavcodec/hevc.c
++@@ -41,8 +41,186 @@
++ #include "hevc.h"
++ #include "profiles.h"
++ 
+++#ifdef RPI
+++  #include "rpi_qpu.h"
+++  #include "rpi_user_vcsm.h"
+++  // Move Inter prediction into separate pass
+++  #define RPI_INTER
+ +
+-+   p[i++] = 0x3000f; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++  #ifdef RPI_INTER_QPU
+++    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+++    #define RPI_MULTI_MAILBOX
+++  #endif
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+++  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+++  //#define RPI_SIMULATE_QPUS
+++  #ifdef RPI_WORKER
+++    #include "pthread.h"
+++  #endif
+ +
+-+unsigned mem_lock(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++  static void rpi_execute_dblk_cmds(HEVCContext *s);
+++  static void rpi_execute_transform(HEVCContext *s);
+++  static void rpi_launch_vpu_qpu(HEVCContext *s);
+++  static void rpi_execute_pred_cmds(HEVCContext *s);
+++  static void rpi_execute_inter_cmds(HEVCContext *s);
+++  static void rpi_begin(HEVCContext *s);
+++  static void flush_frame(HEVCContext *s,AVFrame *frame);
+++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+ +
+-+   p[i++] = 0x3000d; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++#endif
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// #define DISABLE_MC
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+ +
+-+unsigned mem_unlock(int file_desc, unsigned handle)
+++#ifndef av_mod_uintp2
+++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+ +{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x3000e; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++    return a & ((1 << p) - 1);
+ +}
+++#   define av_mod_uintp2   av_mod_uintp2_c
+++#endif
+ +
+-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
++ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++ 
+ +
+-+   p[i++] = 0x30010; // (the tag id)
+-+   p[i++] = 28; // (size of the buffer)
+-+   p[i++] = 28; // (size of the data)
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+++#ifdef RPI_INTER_QPU
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+++// For each block of 64*64 the smallest block size is 8x4
+++// We also need an extra command for the setup information
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define RPI_CHROMA_COMMAND_WORDS 12
+++#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+++// The QPU code for UV blocks only works up to a block width of 8
+++#define RPI_CHROMA_BLOCK_WIDTH 8
+ +
+-+unsigned qpu_enable(int file_desc, unsigned enable)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+++#define RPI_LUMA_COMMAND_WORDS 10
+++#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+ +
+-+   p[i++] = 0x30012; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = enable;
+++// TODO Chroma only needs 4 taps
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++// Actual filter goes -ve, +ve, +ve, -ve using these values
+++static const uint32_t rpi_filter_coefs[8][1] = {
+++        { ENCODE_COEFFS(   0,  64,   0,   0) },
+++        { ENCODE_COEFFS(  2,  58,  10,  2) },
+++        { ENCODE_COEFFS(  4,  54,  16,  2) },
+++        { ENCODE_COEFFS(  6,  46,  28,  4) },
+++        { ENCODE_COEFFS(  4,  36,  36,  4) },
+++        { ENCODE_COEFFS(  4,  28,  46,  6) },
+++        { ENCODE_COEFFS(  2,  16,  54,  4) },
+++        { ENCODE_COEFFS(  2,  10,  58,  2) }
+++};
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#endif
+ +
+-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+-+   int i=0;
+-+   unsigned p[32];
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30011; // (the tag id)
+-+   p[i++] = 16; // (size of the buffer)
+-+   p[i++] = 16; // (size of the data)
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+++#ifdef RPI_WORKER
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define LOG_ENTER
+++#define LOG_EXIT
+ +
+-+int mbox_open() {
+-+   int file_desc;
+++// Call this when we have completed pass0 and wish to trigger pass1 for the current job: publishes one more job to the worker thread
+++static void worker_submit_job(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  s->worker_tail++; // tail counts submitted jobs; worker_start sleeps while tail - head <= 0
+++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot (slots are reused round-robin)
+++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+++}
+ +
+-+   // open a char device file used for communicating with kernel mbox driver
+-+   file_desc = open(DEVICE_FILE_NAME, 0);
+-+   if (file_desc < 0) {
+-+      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+-+      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+-+   }
+-+   return file_desc;
+++// Call this to say we have completed pass1: retires one job and wakes anyone waiting on the head counter
+++static void worker_complete_job(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  s->worker_head++; // head counts completed jobs; worker_wait and worker_pass0_ready compare it with worker_tail
+++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot (slots are reused round-robin)
+++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+ +}
+ +
+-+void mbox_close(int file_desc) {
+-+  close(file_desc);
+++// Call this to wait for all jobs to have completed at the end of a frame (blocks until head has caught up with tail)
+++static void worker_wait(HEVCContext *s)
+++{
+++  LOG_ENTER
+++  pthread_mutex_lock(&s->worker_mutex);
+++  while( s->worker_head !=s->worker_tail) // re-check after every wake-up: jobs are still outstanding while the counters differ
+++  {
+++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); // signalled by worker_complete_job
+++  }
+++  pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+ +}
+-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+-new file mode 100644
+-index 0000000..c264d2e
+---- /dev/null
+-+++ b/libavcodec/rpi_mailbox.h
+-@@ -0,0 +1,20 @@
+-+#ifndef RPI_MAILBOX_H
+-+#define RPI_MAILBOX_H
+ +
+-+extern int mbox_open(void);
+-+extern void mbox_close(int file_desc);
+++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+++// available to receive the next job (blocks while RPI_MAX_JOBS jobs are outstanding).
+++static void worker_pass0_ready(HEVCContext *s)
+++{
+++  LOG_ENTER
+++    pthread_mutex_lock(&s->worker_mutex);
+++    // tail is number of submitted jobs (incremented by worker_submit_job)
+++    // head is number of completed jobs (incremented by worker_complete_job)
+++    // tail-head is number of outstanding jobs in the queue
+++    // we need to ensure there is at least 1 space left for us to use
+++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+++    {
+++      // Wait until another job is completed; the condition is re-checked after every wake-up
+++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+++    }
+++    pthread_mutex_unlock(&s->worker_mutex);
+++  LOG_EXIT
+++}
+ +
+-+extern unsigned get_version(int file_desc);
+-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+-+extern unsigned mem_free(int file_desc, unsigned handle);
+-+extern unsigned mem_lock(int file_desc, unsigned handle);
+-+extern unsigned mem_unlock(int file_desc, unsigned handle);
+-+extern void *mapmem_shared(unsigned base, unsigned size);
+-+extern void *mapmem_private(unsigned base, unsigned size);
+-+extern void unmapmem(void *addr, unsigned size);
+++static void *worker_start(void *arg)
+++{
+++  HEVCContext *s = (HEVCContext *)arg;
+++  while(1) {
+++    pthread_mutex_lock(&s->worker_mutex);
+ +
+-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern unsigned qpu_enable(int file_desc, unsigned enable);
+++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+++    {
+++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+++    }
+++    pthread_mutex_unlock(&s->worker_mutex);
+ +
+-+#endif
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-new file mode 100644
+-index 0000000..b1f50ee
+---- /dev/null
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -0,0 +1,652 @@
+-+#ifdef RPI
+-+// Use the vcsm device for shared memory
+-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+-+#define RPI_USE_VCSM
+-+#define RPI_TIME_TOTAL_QPU
+++    if (s->kill_worker) {
+++      break;
+++    }
+++    LOG_ENTER
+++    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+++    rpi_launch_vpu_qpu(s);
+++    // Perform inter prediction
+++    rpi_execute_inter_cmds(s);
+++    // Wait for transform completion
+++    vpu_wait(s->vpu_id);
+ +
+-+#include <stdio.h>
+-+#include <stdlib.h>
+-+#include <string.h>
+-+#include <stddef.h>
+-+#include <assert.h>
+++    // Perform intra prediction and residual reconstruction
+++    rpi_execute_pred_cmds(s);
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+ +
+-+#include "config.h"
+++    worker_complete_job(s);
+++    LOG_EXIT
+++  }
+++  return NULL;
+++}
+ +
+-+#include <pthread.h>
+-+#include <time.h>
+++#endif
+ +
+-+#include "rpi_mailbox.h"
+-+#include "rpi_qpu.h"
+-+#include "rpi_shader.h"
+-+#include "rpi_hevc_transform.h"
++ /**
++  * NOTE: Each function hls_foo correspond to the function foo in the
++  * specification (HLS stands for High Level Syntax).
++@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
++ /* free everything allocated  by pic_arrays_init() */
++ static void pic_arrays_free(HEVCContext *s)
++ {
+++#ifdef RPI
+++    int job;
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++      if (s->coeffs_buf_arm[job][0]) {
+++        gpu_free(&s->coeffs_buf_default[job]);
+++        s->coeffs_buf_arm[job][0] = 0;
+++      }
+++      if (s->coeffs_buf_arm[job][2]) {
+++        gpu_free(&s->coeffs_buf_accelerated[job]);
+++        s->coeffs_buf_arm[job][2] = 0;
+++      }
+++    }
+++#endif
+++#ifdef RPI_DEBLOCK_VPU
+++    {
+++        int i;
+++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+ +
+-+#ifdef RPI_USE_VCSM
+-+#include "rpi_user_vcsm.h"
+++            if (dvq->vpu_cmds_arm) {
+++                gpu_free(&dvq->deblock_vpu_gmem);
+++              dvq->vpu_cmds_arm = 0;
+++            }
+++        }
+++    }
+ +#endif
++     av_freep(&s->sao);
++     av_freep(&s->deblock);
++ 
++@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
++     int ctb_count        = sps->ctb_width * sps->ctb_height;
++     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
++ 
+++#ifdef RPI
+++    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+++    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+++    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+++    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+++    int job;
+ +
+-+// On Pi2 there is no way to access the VPU L2 cache
+-+// GPU_MEM_FLG should be 4 for uncached memory.
+-+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+-+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+-+#define GPU_MEM_FLG 0xC
+-+#define GPU_MEM_MAP 0x0
+-+
+-+#define vcos_verify(x) ((x)>=0)
+-+
+-+typedef unsigned char uint8_t;
+-+typedef signed char int8_t;
+-+typedef unsigned short uint16_t;
+-+typedef unsigned int uint32_t;
+-+typedef int int32_t;
+-+
+-+/*static const unsigned code[] =
+-+{
+-+  #include "rpi_shader.hex"
+-+};*/
+-+
+-+// Size in 32bit words
+-+#define QPU_CODE_SIZE 2048
+-+#define VPU_CODE_SIZE 2048
+-+
+-+struct GPU
+-+{
+-+  unsigned int qpu_code[QPU_CODE_SIZE];
+-+  unsigned int vpu_code[VPU_CODE_SIZE];
+-+  int open_count; // Number of allocated video buffers
+-+  unsigned int vc_handle; // Handle of this memory
+-+  int      mb; // Mailbox handle
+-+  int      vc; // Address in GPU memory
+-+  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-+};
+-+
+-+// Stop more than one thread trying to allocate memory or use the processing resources at once
+-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+static volatile struct GPU* gpu = NULL;
+-+
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+static unsigned int Microseconds(void) {
+-+    struct timespec ts;
+-+    unsigned int x;
+-+    static unsigned int base = 0;
+-+    clock_gettime(CLOCK_REALTIME, &ts);
+-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+-+    if (base==0) base=x;
+-+    return x-base;
+-+}
+++    av_assert0(sps);
+++    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+++    s->ctu_per_y_chan = s->max_ctu_count / 12;
+++    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++      printf("Allocated %d\n",coefs_per_row);
+++      for(job=0;job<RPI_MAX_JOBS;job++) {
+++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+++        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+++        if (!s->coeffs_buf_arm[job][0])
+++            goto fail;
+++        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+++        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+++        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+++        if (!s->coeffs_buf_arm[job][2])
+++            goto fail;
+++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+++        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+++      }
+++    }
+ +#endif
+++#ifdef RPI_DEBLOCK_VPU
+++    {
+++        int i;
+++        s->enable_rpi_deblock = !sps->sao_enabled;
+++        s->setup_width = (sps->width+15) / 16;
+++        s->setup_height = (sps->height+15) / 16;
+++        s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+++        s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+++
+++        for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) // allocate and carve up one GPU buffer per s->dvq_ents[] entry
+++        {
+++            struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+++            const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; // 3 command entries, rounded up to a multiple of 16 bytes
+++            const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; // setup_width x setup_height Y entries, rounded up to 16
+++            const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; // uv_setup_width x uv_setup_height UV entries, rounded up to 16
+++            const unsigned int total_size = cmd_size + y_size + uv_size; // must cover all three regions carved out below ("=-" negated cmd_size and under-allocated)
+++            int p_vc;
+++            uint8_t * p_arm;
+++ #if RPI_VPU_DEBLOCK_CACHED
+++            gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+++ #else
+++            gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+++ #endif
+++            p_vc = dvq->deblock_vpu_gmem.vc;
+++            p_arm = dvq->deblock_vpu_gmem.arm; // NOTE(review): allocation result is not checked before the memset below - confirm gpu_malloc_* cannot fail here
+++
+++            // Zero the whole allocation, using the size recorded in the GPU memory descriptor
+++            memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+++
+++            // Subdivide as [commands | Y setup | UV setup], advancing the ARM pointer and the VC address in step
+++            dvq->vpu_cmds_arm = (void*)p_arm;
+++            dvq->vpu_cmds_vc = p_vc;
+++
+++            p_arm += cmd_size;
+++            p_vc += cmd_size;
+++
+++            dvq->y_setup_arm = (void*)p_arm;
+++            dvq->y_setup_vc = (void*)p_vc;
+++
+++            p_arm += y_size;
+++            p_vc += y_size;
+++
+++            dvq->uv_setup_arm = (void*)p_arm;
+++            dvq->uv_setup_vc = (void*)p_vc;
+++
+++            dvq->cmd_id = -1;
+++        }
+ +
+-+// Connect to QPU, returns 0 on success.
+-+static int gpu_init(volatile struct GPU **gpu) {
+-+  int mb = mbox_open();
+-+  int vc;
+-+  int handle;
+-+  volatile struct GPU* ptr;
+-+	if (mb < 0)
+-+		return -1;
+-+
+-+	if (qpu_enable(mb, 1)) return -2;
+-+
+-+#ifdef RPI_USE_VCSM
+-+  vcsm_init();
+++        s->dvq_n = 0;
+++        s->dvq = s->dvq_ents + s->dvq_n;
+++    }
+ +#endif
+ +
+-+  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
+-+  if (!handle)
+-+  {
+-+    qpu_enable(mb, 0);
+-+    return -3;
+-+  }
+-+	vc = mem_lock(mb, handle);
+-+	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
+-+	if (ptr == NULL)
+-+	{	mem_free(mb, handle);
+-+		mem_unlock(mb, handle);
+-+		qpu_enable(mb, 0);
+-+		return -4;
+-+	}
+-+
+-+	ptr->mb = mb;
+-+	ptr->vc_handle = handle;
+-+	ptr->vc = vc;
+-+
+-+  *gpu = ptr;
+-+
+-+  // Now copy over the QPU code into GPU memory
+-+  {
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
+-+    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-+  }
+-+  // And the VPU code
+-+  {
+-+    int num_bytes = sizeof(rpi_hevc_transform);
+-+    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-+  }
+-+
+-+  return 0;
+-+}
+-+
+-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+-+static void gpu_lock(void) {
+-+  pthread_mutex_lock(&gpu_mutex);
+-+  if (gpu==NULL) {
+-+    gpu_init(&gpu);
+-+  }
+-+}
+-+
+-+static void gpu_unlock(void) {
+-+  pthread_mutex_unlock(&gpu_mutex);
+-+}
+-+
+-+// Allocate memory on GPU
+-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+-+// Returns 0 on success.
+-+// This allocates memory that will not be cached in ARM's data cache.
+-+// Therefore safe to use without data cache flushing.
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+-+  p->vcsm_handle = 0;
+-+  if (!p->vc_handle)
+-+  {
+-+    qpu_enable(gpu->mb, 0);
+-+    return -3;
+-+  }
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+-+  p->numbytes = numbytes;
+-+  if (p->arm == NULL)
+-+  {
+-+    mem_free(gpu->mb, p->vc_handle);
+-+    mem_unlock(gpu->mb, p->vc_handle);
+-+    gpu_unlock();
+-+    qpu_enable(gpu->mb, 0);
+-+    return -4;
+-+  }
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return 0;
+-+}
+-+
+-+void gpu_cache_flush(GPU_MEM_PTR_T *p)
++     s->bs_width  = (width  >> 2) + 1;
++     s->bs_height = (height >> 2) + 1;
++ 
++@@ -137,6 +422,29 @@ fail:
++     return AVERROR(ENOMEM);
++ }
++ 
+++static void default_pred_weight_table(HEVCContext * const s)
+ +{
+-+  // This only works when using RPI_USE_VCSM
+-+  void *tmp = vcsm_lock(p->vcsm_handle);
+-+  vcsm_unlock_ptr(tmp);
+-+}
+-+
+-+// This allocates data that will be
+-+//    Cached in ARM L2
+-+//    Uncached in VPU L2
+-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+#ifdef RPI_USE_VCSM
+-+  {
+-+      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
+-+      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
+-+      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+      p->arm = vcsm_lock(p->vcsm_handle);
+-+      p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  }
+-+#else
+-+  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+-+  p->vcsm_handle = 0;
+-+  if (!p->handle)
+-+  {
+-+    qpu_enable(gpu->mb, 0);
+-+    return -3;
+++  unsigned int i;
+++  s->sh.luma_log2_weight_denom = 0;
+++  s->sh.chroma_log2_weight_denom = 0;
+++  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
+++      s->sh.luma_weight_l0[i] = 1;
+++      s->sh.luma_offset_l0[i] = 0;
+++      s->sh.chroma_weight_l0[i][0] = 1;
+++      s->sh.chroma_offset_l0[i][0] = 0;
+++      s->sh.chroma_weight_l0[i][1] = 1;
+++      s->sh.chroma_offset_l0[i][1] = 0;
+ +  }
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  printf("This mapmem_private does not seem to work\n");
+-+  exit(-1);
+-+  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+-+  p->numbytes = numbytes;
+-+  if (p->arm == NULL)
+-+  {
+-+    mem_free(gpu->mb, p->handle);
+-+    mem_unlock(gpu->mb, p->handle);
+-+    gpu_unlock();
+-+    qpu_enable(gpu->mb, 0);
+-+    return -4;
+++  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
+++      s->sh.luma_weight_l1[i] = 1;
+++      s->sh.luma_offset_l1[i] = 0;
+++      s->sh.chroma_weight_l1[i][0] = 1;
+++      s->sh.chroma_offset_l1[i][0] = 0;
+++      s->sh.chroma_weight_l1[i][1] = 1;
+++      s->sh.chroma_offset_l1[i][1] = 0;
+ +  }
+-+#endif
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return 0;
+ +}
+ +
+-+static void gpu_term(void)
++ static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
++ {
++     int i = 0;
++@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
++                 (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
++                 pred_weight_table(s, gb);
++             }
+++            else
+++            {
+++              // Give us unit weights
+++              default_pred_weight_table(s);
+++            }
++ 
++             sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++             if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
++     return 0;
++ }
++ 
+++#ifdef RPI
+++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+ +{
+-+	int mb = gpu->mb;
+-+	unsigned handle = gpu->vc_handle;
+-+  if (gpu==NULL)
+-+    return;
+-+	unmapmem((void*)gpu, sizeof(struct GPU));
+-+	mem_unlock(mb, handle);
+-+	mem_free(mb, handle);
+-+	qpu_enable(mb, 0);
+-+#ifdef RPI_USE_VCSM
+-+  vcsm_exit();
+-+#endif
+-+	mbox_close(mb);
+-+  gpu = NULL;
+++    if (s->enable_rpi) {
+++        HEVCLocalContext *lc = s->HEVClc;
+++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++        cmd->type = RPI_PRED_INTRA;
+++        cmd->size = log2_trafo_size;
+++        cmd->c_idx = c_idx;
+++        cmd->x = x0;
+++        cmd->y = y0;
+++        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+++        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+++    } else {
+++        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+++    }
+ +}
+++#endif
+ +
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-+  int mb = gpu->mb;
+-+	unsigned handle = p->vc_handle;
+-+  gpu_lock();
+-+#ifdef RPI_USE_VCSM
+-+  if (p->vcsm_handle) {
+-+      mem_unlock(mb,p->vc_handle);
+-+      vcsm_unlock_ptr(p->arm);
+-+      vcsm_free(p->vcsm_handle);
+-+  } else {
+-+	unmapmem((void*)p->arm, sizeof(struct GPU));
+-+      mem_unlock(mb, handle);
+-+      mem_free(mb, handle);
+-+  }
++ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                               int xBase, int yBase, int cb_xBase, int cb_yBase,
++                               int log2_cb_size, int log2_trafo_size,
++@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++     if (lc->cu.pred_mode == MODE_INTRA) {
++         int trafo_size = 1 << log2_trafo_size;
++         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
++-
+++#ifdef RPI
+++        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
+ +#else
+-+	unmapmem((void*)p->arm, sizeof(struct GPU));
+-+	mem_unlock(mb, handle);
+-+	mem_free(mb, handle);
++         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
+ +#endif
+-+
+-+  gpu->open_count--;
+-+  if (gpu->open_count==0) {
+-+      printf("Closing GPU\n");
+-+      gpu_term();
+-+      gpu = NULL;
+-+  }
+-+  gpu_unlock();
+-+}
+-+
+-+unsigned int vpu_get_fn(void) {
+-+  // Make sure that the gpu is initialized
+-+  if (gpu==NULL) {
+-+    printf("Preparing gpu\n");
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,vpu_code);
+-+}
+-+
+-+unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+  unsigned r;
+-+  gpu_lock();
+-+  r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
+-+  gpu_unlock();
+-+  return r;
+-+}
+-+
+-+// Run a program on a QPU with the given code and uniform stream (given in GPU addresses)
+-+// The first num QPUs will start at code, the next num2 QPUs will start at code2
+-+void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12)
+-+{
+-+  int i;
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
++     }
++ 
++     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
++@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
+ +#endif
+-+
+-+  gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
++                 }
++                 if (cbf_cb[i])
++                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
++@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
+ +#endif
+-+  for(i=0;i<num;i++) {
+-+    gpu->mail[i*2 + 1] = code;
+-+  }
+-+  for(;i<num+num2;i++) {
+-+    gpu->mail[i*2 + 1] = code2;
+-+  }
+-+  gpu->mail[0 ] = unifs1;
+-+  gpu->mail[2 ] = unifs2;
+-+  gpu->mail[4 ] = unifs3;
+-+  gpu->mail[6 ] = unifs4;
+-+  gpu->mail[8 ] = unifs5;
+-+  gpu->mail[10] = unifs6;
+-+	gpu->mail[12] = unifs7;
+-+	gpu->mail[14] = unifs8;
+-+	gpu->mail[16] = unifs9;
+-+	gpu->mail[18] = unifs10;
+-+	gpu->mail[20] = unifs11;
+-+	gpu->mail[22] = unifs12;
+-+	execute_qpu(
+-+		gpu->mb,
+-+		12 /* Number of QPUs */,
+-+		gpu->vc + offsetof(struct GPU, mail),
+-+		1 /* no flush */,  // Don't flush VPU L1 cache
+-+		5000 /* timeout ms */);
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  if ((count&0x7f)==0)
+-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
++                 }
++                 if (cbf_cr[i])
++                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
++@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
++                                                     trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
+ +#endif
+-+  gpu_unlock();
+-+}
+-+
+-+unsigned int qpu_get_fn(int num) {
+-+    // Make sure that the gpu is initialized
+-+    unsigned int *fn;
+-+    if (gpu==NULL) {
+-+      printf("Preparing gpu\n");
+-+      gpu_lock();
+-+      gpu_unlock();
+-+    }
+-+    switch(num) {
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-+    case QPU_MC_EXIT:
+-+      fn = mc_exit;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT:
+-+      fn = mc_interrupt_exit;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-+    case QPU_MC_FILTER_HONLY:
+-+      fn = mc_filter_honly;
+-+      break;
+-+    case QPU_MC_SETUP_UV:
+-+      fn = mc_setup_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV:
+-+      fn = mc_filter_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B:
+-+      fn = mc_filter_uv_b;
+-+      break;
+-+    case QPU_MC_END:
+-+      fn = mc_end;
+-+      break;
+-+    default:
+-+      printf("Unknown function\n");
+-+      exit(-1);
+-+    }
+-+    return gpu->vc + 4*(int)(fn-rpi_shader);
+-+    //return code[num] + gpu->vc;
++                 }
++                 if (cbf_cb[i])
++                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
++@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++                 if (lc->cu.pred_mode == MODE_INTRA) {
++                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
+++#else
++                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
+++#endif
++                 }
++                 if (cbf_cr[i])
++                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
++@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
++             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
++             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
+++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
+++#else
++             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
++             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
+++#endif
++             if (s->ps.sps->chroma_format_idc == 2) {
++                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
+++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
+++#else
++                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
++                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
+++#endif
++             }
++         } else if (blk_idx == 3) {
++             int trafo_size_h = 1 << (log2_trafo_size + 1);
++             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
++             ff_hevc_set_neighbour_available(s, xBase, yBase,
++                                             trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
+++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
+++#else
++             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
++             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
+++#endif
++             if (s->ps.sps->chroma_format_idc == 2) {
++                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
++                                                 trafo_size_h, trafo_size_v);
+++#ifdef RPI
+++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
+++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
+++#else
++                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
++                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
+++#endif
++             }
++         }
++     }
++@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++  * @param luma_offset additive offset applied to the luma prediction value
++  */
++ 
+++#ifdef RPI_INTER
+++#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+++                        int block_w, int block_h, int luma_weight, int luma_offset)
+++{ // Records the arguments as an RPI_CMD_LUMA_UNI entry in the current pass0 job's command list; no prediction is done here
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; // Take the next slot and bump the count (no capacity check here)
+++    cmd->cmd = RPI_CMD_LUMA_UNI;
+++    cmd->dst = dst;
+++    cmd->dststride = dststride;
+++    cmd->src = ref->data[0]; // Plane 0 of the reference frame
+++    cmd->srcstride = ref->linesize[0];
+++    cmd->mv = *mv; // Copied by value, so the caller's Mv need not outlive this call
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->weight = luma_weight;
+++    cmd->offset = luma_offset;
+ +}
+ +
+-+#if 0
+-+
+-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
+-+
+-+static uint8_t av_clip_uint8(int32_t a)
+++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+ +{
+-+    if (a&(~255)) return (-a)>>31;
+-+    else          return a;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_LUMA_BI;
+++    cmd->dst = dst;
+++    cmd->dststride = dststride;
+++    cmd->src = ref0->data[0];
+++    cmd->srcstride = ref0->linesize[0];
+++    cmd->mv = *mv0;
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->src1 = ref1->data[0];
+++    cmd->srcstride1 = ref1->linesize[0];
+++    cmd->mv1 = *mv1;
+++    cmd->ref_idx[0] = current_mv->ref_idx[0];
+++    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+static int32_t filter8(const uint8_t *data, int pitch)
+++static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+++                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+ +{
+-+   int32_t vsum = 0;
+-+   int x, y;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += hcoeffs[x]*data[x + y * pitch];
+-+
+-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+-+   }
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    cmd->cmd = RPI_CMD_CHROMA_UNI;
+++    cmd->dst = dst0;
+++    cmd->dststride = dststride;
+++    cmd->src = src0;
+++    cmd->srcstride = srcstride;
+++    cmd->mv = current_mv->mv[reflist];
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->weight = chroma_weight;
+++    cmd->offset = chroma_offset;
+++}
+ +
+-+   return av_clip_uint8( (vsum + 64) >> 7);
+++static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+++                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+++{ // Records the arguments as a chroma bi-prediction entry in the current pass0 job's command list; no prediction is done here
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; // Take the next slot and bump the count (no capacity check here)
+++    cmd->cmd = RPI_CMD_CHROMA_BI+cidx; // Command code is offset by cidx
+++    cmd->dst = dst0;
+++    cmd->dststride = dststride;
+++    cmd->src = ref0->data[cidx+1]; // Plane cidx+1 of the first reference
+++    cmd->srcstride = ref0->linesize[cidx+1];
+++    cmd->mv = current_mv->mv[0];
+++    cmd->mv1 = current_mv->mv[1];
+++    cmd->x_off = x_off;
+++    cmd->y_off = y_off;
+++    cmd->block_w = block_w;
+++    cmd->block_h = block_h;
+++    cmd->src1 = ref1->data[cidx+1]; // Plane cidx+1 of the second reference
+++    cmd->srcstride1 = ref1->linesize[cidx+1];
+++    cmd->ref_idx[0] = current_mv->ref_idx[0];
+++    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+// Note regression changes coefficients so is not thread safe
+-+//#define REGRESSION
+-+#ifdef REGRESSION
+-+#define CMAX 100
+ +#else
+-+#define CMAX 2
+++#define RPI_REDIRECT(fn) fn
+ +#endif
+-+#define YMAX 16
+-+
+-+int rpi_test_shader(void)
+-+{
+-+   int i, c;
+ +
+-+   uint32_t *unifs;
++ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
++                         int block_w, int block_h, int luma_weight, int luma_offset)
++@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
++     int idx              = ff_hevc_pel_weight[block_w];
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   uint8_t *in_buffer;
+-+   uint8_t *out_buffer[2];
++     x_off += mv->x >> 2;
++     y_off += mv->y >> 2;
++     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
++@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++  * @param mv1 motion vector1 (relative to block position) to get pixel data from
++  * @param current_mv current motion vector structure
++  */
++- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
++                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
++ {
++@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
++     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   GPU_MEM_PTR_T in_buffer_ptr;
+-+   GPU_MEM_PTR_T out_buffer_ptr[2];
++     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
++         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
++         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
++@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++     intptr_t _mx         = mx << (1 - hshift);
++     intptr_t _my         = my << (1 - vshift);
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+
+-+   int pitch = 0x500;
+-+
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
++     x_off += mv->x >> (2 + hshift);
++     y_off += mv->y >> (2 + vshift);
++     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
++@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
++     int hshift = s->ps.sps->hshift[1];
++     int vshift = s->ps.sps->vshift[1];
++ 
+++#ifdef DISABLE_MC
+++    return;
+++#endif
+ +
+-+   printf("This needs to change to reflect new assembler\n");
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
++     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
++     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
++     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
++@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
++     }
++ }
++ 
++-static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++-                                int nPbW, int nPbH,
++-                                int log2_cb_size, int partIdx, int idx)
+++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+++                                const int nPbW, const int nPbH,
+++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++ {
++ #define POS(c_idx, x, y)                                                              \
++     &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
++                            (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
++-    HEVCLocalContext *lc = s->HEVClc;
+++    HEVCLocalContext * const lc = s->HEVClc;
++     int merge_idx = 0;
++     struct MvField current_mv = {{{ 0 }}};
++ 
++@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++     int y_cb             = y0 >> log2_min_cb_size;
++     int x_pu, y_pu;
++     int i, j;
++-
++-    int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+++    const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++ 
++     if (!skip_flag)
++         lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
++@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi) {
+++            const Mv * const mv    = &current_mv.mv[0];
+++            const unsigned int mx          = mv->x & 3;
+++            const unsigned int my          = mv->y & 3;
+++            const unsigned int my_mx       = (my<<8) | mx;
+++            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
+++            const int x1_m3 = x0 + (mv->x >> 2) - 3;
+++            const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+++            uint32_t * y = s->curr_y_mvs;
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return -2;
+-+   }
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+-+      return -3;
+-+   }
+-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++              for(int start_x=0;start_x < nPbW;start_x+=16) {
+++                  const int bw = nPbW-start_x;
+++                  const int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+++                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+++        {
+++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
++                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
++                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
++                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+++        }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+++#ifdef RPI_INTER_QPU
+++          if (s->enable_rpi) {
+++                int hshift           = s->ps.sps->hshift[1];
+++                int vshift           = s->ps.sps->vshift[1];
+++                const Mv *mv         = &current_mv.mv[0];
+++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                intptr_t _mx         = mx << (1 - hshift);
+++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+ +
+-+   if (!vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+-+      return -4;
+-+   }
+-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+++                int x1_c = x0_c + (mv->x >> (2 + hshift)); // Integer chroma x position after applying the motion vector
+++                int y1_c = y0_c + (mv->y >> (2 + vshift)); // Vertical component uses the vertical chroma shift, matching my above
+ +
+-+   for (c = 0; c < CMAX; c++) {
+-+      int xo[] = {rand()&31, rand()&31};
+++                uint32_t *u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      int bw = nPbW_c-start_x;
+++                      int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+++                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u;
+++                return;
+++            }
+++#endif
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
++                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
++-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
++                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
++         }
++@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi) {
+++            const int reflist = 1;
+++            const Mv *mv    = &current_mv.mv[reflist];
+++            int mx          = mv->x & 3;
+++            int my          = mv->y & 3;
+++            int my_mx = (my<<8) + mx;
+++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+++            int x1 = x0 + (mv->x >> 2);
+++            int y1 = y0 + (mv->y >> 2);
+++            uint32_t *y = s->curr_y_mvs;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              for(int start_x=0;start_x < nPbW;start_x+=16) {
+++                  int bw = nPbW-start_x;
+++                  int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+++                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+++                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+ +
+-+#ifdef REGRESSION
+-+      for (i = 0; i < 8; i++) {
+-+         hcoeffs[i] = (int8_t)rand();
+-+         vcoeffs[i] = (int8_t)rand();
+-+         if (hcoeffs[i]==-128)
+-+           hcoeffs[i]++;
+-+         if (vcoeffs[i]==-128)
+-+           vcoeffs[i]++;
+-+      }
+++        {
+++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
++                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
++                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
++                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+++        }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+++#ifdef RPI_INTER_QPU
+++            if (s->enable_rpi) {
+++                const int reflist = 1;
+++                const int hshift           = s->ps.sps->hshift[1];
+++                const int vshift           = s->ps.sps->vshift[1];
+++                const Mv * const mv        = &current_mv.mv[reflist];
+++                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                const intptr_t _mx         = mx << (1 - hshift);
+++                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+++
+++                const int x1_c = x0_c + (mv->x >> (2 + hshift));
+++                const int y1_c = y0_c + (mv->y >> (2 + hshift));
+++
+++                uint32_t * u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      const int bw = nPbW_c-start_x;
+++                      const int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u;
+++                return;
+++            }
+ +#endif
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
++                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
++ 
++-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
++                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
++         }
++@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
++         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
++ 
++-        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+++#ifdef RPI_LUMA_QPU
+++        if (s->enable_rpi && 0) {
+++            const Mv *mv    = &current_mv.mv[0];
+++            int mx          = mv->x & 3;
+++            int my          = mv->y & 3;
+++            int my_mx = (my<<8) + mx;
+++            const Mv *mv2    = &current_mv.mv[1];
+++            int mx2          = mv2->x & 3;
+++            int my2          = mv2->y & 3;
+++            int my2_mx2 = (my2<<8) + mx2;
+++            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+++            int x1 = x0 + (mv->x >> 2);
+++            int y1 = y0 + (mv->y >> 2);
+++            int x2 = x0 + (mv2->x >> 2);
+++            int y2 = y0 + (mv2->y >> 2);
+++            uint32_t *y = s->curr_y_mvs;
+++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+++              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+++                  int bw = nPbW-start_x;
+++                  int bh = nPbH-start_y;
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+++                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+++                  *y++ = my2_mx2_my_mx;
+ +
+-+      for (i = 0; i < 64*23; i++) {
+-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+-+         in_buffer[i] = rand();
+-+      }
+++                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+++                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+++                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+++                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+ +
+-+      // Clear output array
+-+      {
+-+        int b;
+-+        for(b=0;b<2;b++) {
+-+          for(i=0;i<16*16;i++) {
+-+            out_buffer[b][i] = 3;
+-+          }
+++                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+++                }
+++            }
+++            s->curr_y_mvs = y;
+++        } else
+++#endif
+++        {
+++            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
++                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
++                    ref1->frame, &current_mv.mv[1], &current_mv);
+ +        }
+-+      }
++ 
++         if (s->ps.sps->chroma_format_idc) {
++-            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+++#ifdef RPI_INTER_QPU
+++          if (s->enable_rpi) {
+++                int hshift           = s->ps.sps->hshift[1];
+++                int vshift           = s->ps.sps->vshift[1];
+++                const Mv *mv         = &current_mv.mv[0];
+++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+++                intptr_t _mx         = mx << (1 - hshift);
+++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+++                int x1_c = x0_c + (mv->x >> (2 + hshift));
+++                int y1_c = y0_c + (mv->y >> (2 + hshift));
+ +
+-+      unifs[0] = mc_filter;
+-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+-+      unifs[2] = 64; // src pitch
+-+      unifs[3] = pitch; // dst pitch
+-+      unifs[4] = 0; // Padding
+-+      unifs[5] = 0;
+-+      unifs[6] = 0;
+-+      unifs[7 ] = mc_filter;
+-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[13] = out_buffer_ptr[0].vc;
+-+      unifs[14] = mc_exit;
+-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[20] = out_buffer_ptr[1].vc;
+-+
+-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+++                const Mv *mv2         = &current_mv.mv[1];
+++                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+++                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+++                intptr_t _mx2         = mx2 << (1 - hshift);
+++                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+ +
+-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+++                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+++                int y2_c = y0_c + (mv2->y >> (2 + hshift));
+ +
+-+      if (1)
+-+      {
+-+         int x, y, b;
+-+         int bad = 0;
+ +
+-+         for (b=0; b<2; ++b)
+-+            for (y=0; y<YMAX; ++y)
+-+               for (x=0; x<16; ++x) {
+-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+++                uint32_t *u = s->curr_u_mvs;
+++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+++                      int bw = nPbW_c-start_x;
+++                      int bh = nPbH_c-start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx][0];
+++                      *u++ = rpi_filter_coefs[_my][0];
+++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+++                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+++                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
+++                      *u++ = 0;
+ +
+-+                  if (out_buffer[b][x+y*pitch] != ref) {
+-+                      bad = 1;
+-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
+-+                  }
+-+#ifndef REGRESSION
+-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+++                      *u++ = rpi_filter_coefs[_mx2][0];
+++                      *u++ = rpi_filter_coefs[_my2][0];
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+++                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+++                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+++                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+++                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+++                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+++                    }
+++                }
+++                s->curr_u_mvs = u;
+++                return;
+++            }
+ +#endif
+-+               }
+-+          if (bad)
+-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+          else
+-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+      }
+-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
+-+   }
+-+
+-+   gpu_free(&out_buffer_ptr[0]);
+-+   gpu_free(&out_buffer_ptr[1]);
+-+   gpu_free(&in_buffer_ptr);
+-+   gpu_free(&unifs_ptr);
+-+
+-+   return 0;
+-+}
+-+
+-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
++                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
++ 
++-            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+++            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
++                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
++         }
++     }
++@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
++     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
++ }
++ 
+++#ifdef RPI
+++static void rpi_execute_dblk_cmds(HEVCContext *s)
+ +{
+-+  int x,y;
+-+  for (y=0; y<16; ++y) {
+-+    for (x=0; x<16; ++x) {
+-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+++    int n;
+++    int job = s->pass1_job;
+++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+++    int (*p)[2] = s->dblk_cmds[job];
+++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+ +    }
+-+  }
+++    s->num_dblk_cmds[job] = 0;
+ +}
+ +
+-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+++static void rpi_execute_transform(HEVCContext *s)
+ +{
+-+   uint32_t *unifs;
+-+
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   //uint8_t *out_buffer;
+-+   //GPU_MEM_PTR_T out_buffer_ptr;
+-+
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+   //int x,y;
+-+
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+-+
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+-+
+-+   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return;
+-+   }
+-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+-+
+-+   /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         out_buffer[x+y*dst_pitch] = 7;
+-+      }
+++    int i=2;
+++    int job = s->pass1_job;
+++    /*int j;
+++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
+++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+++        s->hevcdsp.idct[4-2](coeffs, 16);
+++    }
+++    i=3;
+++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+++        s->hevcdsp.idct[5-2](coeffs, 32);
+ +    }*/
+ +
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+-+
+-+    unifs[0] = mc_filter;
+-+    unifs[1] = (int)in_buffer_vc;
+-+    unifs[2] = src_pitch; // src pitch
+-+    unifs[3] = dst_pitch; // dst pitch
+-+    unifs[4] = 0; // Padding
+-+    unifs[5] = 0;
+-+    unifs[6] = 0;
+-+    unifs[7 ] = mc_exit;
+-+    unifs[8 ] = (int)in_buffer_vc;
+-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+    unifs[13] = (int)dst_vc;
+-+    //unifs[13] = (int)out_buffer_ptr.vc;
+-+
+-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+
+-+    /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+-+      }
+-+    }*/
+++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+++    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
+++    //vpu_wait(s->vpu_id);
+ +
+-+    gpu_free(&unifs_ptr);
+-+    //gpu_free(&out_buffer_ptr);
+++    for(i=0;i<4;i++)
+++        s->num_coeffs[job][i] = 0;
+ +}
+ +
+-+
+++static void rpi_execute_pred_cmds(HEVCContext *s)
+++{
+++  int i;
+++  int job = s->pass1_job;
+++  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+++#ifdef RPI_WORKER
+++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+++#else
+++  HEVCLocalContext *lc = s->HEVClc;
+ +#endif
+ +
+-+#endif // RPI
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-new file mode 100644
+-index 0000000..4e3c35c
+---- /dev/null
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -0,0 +1,45 @@
+-+#ifndef RPI_QPU_H
+-+#define RPI_QPU_H
+-+
+-+typedef struct gpu_mem_ptr_s {
+-+  unsigned char *arm; // Pointer to memory mapped on ARM side
+-+  int vc_handle;   // Videocore handle of relocatable memory
+-+  int vcsm_handle; // Handle for use by VCSM
+-+  int vc;       // Address for use in GPU code
+-+  int numbytes; // Size of memory block
+-+} GPU_MEM_PTR_T;
+-+
+-+// General GPU functions
+-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern void gpu_free(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+
+-+// QPU specific functions
+-+extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+++      if (cmd->type == RPI_PRED_INTRA) {
+++          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+++          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+++          lc->na.cand_left         = (cmd->na >> 3) & 1;
+++          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+++          lc->na.cand_up           = (cmd->na >> 1) & 1;
+++          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+++          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+++      } else {
+++#ifdef RPI_PRECLEAR
+++          int trafo_size = 1 << cmd->size;
+++#endif
+++          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+++#ifdef RPI_PRECLEAR
+++          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+++#endif
+++      }
+++  }
+++  s->num_pred_cmds[job] = 0;
+++}
+ +
+-+enum {
+-+  QPU_MC_SETUP,
+-+  QPU_MC_FILTER,
+-+  QPU_MC_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT,
+-+  QPU_MC_FILTER_B,
+-+  QPU_MC_FILTER_HONLY,
+-+  QPU_MC_SETUP_UV,
+-+  QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B,
+-+  QPU_MC_END
+-+  };
+-+extern unsigned int qpu_get_fn(int num);
+-+
+-+// VPU specific functions
+-+extern unsigned int vpu_get_fn(void);
+-+extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+
+-+// Simple test of shader code
+-+extern int rpi_test_shader(void);
+++static void rpi_execute_inter_cmds(HEVCContext *s)
+++{
+++    int job = s->pass1_job;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+++    int n,cidx;
+++    AVFrame myref;
+++    AVFrame myref1;
+++    struct MvField mymv;
+++    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+++        printf("Overflow inter_cmds\n");
+++        exit(-1);
+++    }
+++    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+++        switch(cmd->cmd) {
+++        case RPI_CMD_LUMA_UNI:
+++            myref.data[0] = cmd->src;
+++            myref.linesize[0] = cmd->srcstride;
+++            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+++            break;
+++        case RPI_CMD_LUMA_BI:
+++            myref.data[0] = cmd->src;
+++            myref.linesize[0] = cmd->srcstride;
+++            myref1.data[0] = cmd->src1;
+++            myref1.linesize[0] = cmd->srcstride1;
+++            mymv.ref_idx[0] = cmd->ref_idx[0];
+++            mymv.ref_idx[1] = cmd->ref_idx[1];
+++            luma_mc_bi(s, cmd->dst, cmd->dststride,
+++                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+++                       &myref1, &cmd->mv1, &mymv);
+++            break;
+++        case RPI_CMD_CHROMA_UNI:
+++            mymv.mv[0] = cmd->mv;
+++            chroma_mc_uni(s, cmd->dst,
+++                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+++                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+++            break;
+++        case RPI_CMD_CHROMA_BI:
+++        case RPI_CMD_CHROMA_BI+1:
+++            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+++            myref.data[cidx+1] = cmd->src;
+++            myref.linesize[cidx+1] = cmd->srcstride;
+++            myref1.data[cidx+1] = cmd->src1;
+++            myref1.linesize[cidx+1] = cmd->srcstride1;
+++            mymv.ref_idx[0] = cmd->ref_idx[0];
+++            mymv.ref_idx[1] = cmd->ref_idx[1];
+++            mymv.mv[0] = cmd->mv;
+++            mymv.mv[1] = cmd->mv1;
+++            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+++                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+++            break;
+++        }
+++    }
+++    s->num_mv_cmds[job] = 0;
+++}
+ +
+-+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+-+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+++static void rpi_do_all_passes(HEVCContext *s)
+++{
+++    // Kick off QPUs and VPUs
+++    rpi_launch_vpu_qpu(s);
+++    // Perform luma inter prediction
+++    rpi_execute_inter_cmds(s);
+++    // Wait for transform completion
+++    vpu_wait(s->vpu_id);
+++    // Perform intra prediction and residual reconstruction
+++    rpi_execute_pred_cmds(s);
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+++    // Prepare next batch
+++    rpi_begin(s);
+++}
+ +
+ +#endif
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-new file mode 100644
+-index 0000000..41cc2e1
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.c
+-@@ -0,0 +1,818 @@
+-+#include "rpi_shader.h"
+ +
+-+#ifdef _MSC_VER
+-+   #include <stdint.h>
+-+   /* cast through uintptr_t to avoid warnings */
+-+   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+-+#else
+-+   #define POINTER_TO_UINT(X) ((unsigned int)(X))
+-+#endif
+++#ifdef RPI
+++static void rpi_begin(HEVCContext *s)
+++{
+++    int job = s->pass0_job;
+++    int i;
+++#ifdef RPI_INTER_QPU
+++    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+++    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+ +
+-+#ifdef __cplusplus
+-+extern "C" { /* the types are probably wrong... */
+-+#endif
+-+#ifdef __cplusplus
+-+}
+++    for(i=0;i<8;i++) {
+++        s->u_mvs[job][i] = s->mvs_base[job][i];
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = pic_width;
+++        *s->u_mvs[job][i]++ = pic_height;
+++        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+++        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+++        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+++        *s->u_mvs[job][i]++ = 0;
+++        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+++    }
+++    s->curr_u_mvs = s->u_mvs[job][0];
+ +#endif
+ +
+-+#ifdef _MSC_VER
+-+__declspec(align(8))
+-+#elif defined(__GNUC__)
+-+__attribute__((aligned(8)))
+-+#endif
+-+unsigned int rpi_shader[] = {
+-+// ::mc_setup
+-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
+-+/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
+-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+// ::mc_filter_uv
+-+/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop
+-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter
+-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+-+/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :loop
+-+/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// :fast_path
+-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :fast_loop
+-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+-+/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+-+/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+-+/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_b
+-+/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :bloop
+-+/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+-+/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_honly
+-+/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+-+/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+-+/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :loop_honly
+-+/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+-+/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+-+/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+-+/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+-+/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+-+/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit
+-+/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit4
+-+/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_setup_uv
+-+/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+// ::mc_filter_uv_b
+-+/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop_b
+-+/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_end
+-+};
+-+#ifdef __HIGHC__
+-+#pragma Align_to(8, rpi_shader)
+++#ifdef RPI_LUMA_QPU
+++    for(i=0;i<12;i++) {
+++        // Write the initial setup record into each of the 12 y_mvs lists.
+++        // It needs a generally similar structure to the actual filter code, as
+++        // various pipelined bits must land correctly when filter requests are inserted
+++        s->y_mvs[job][i] = s->y_mvs_base[job][i]; // rewind the write pointer to the list base
+++        *s->y_mvs[job][i]++ = 0; // y_x
+++        *s->y_mvs[job][i]++ = 0; // ref_y_base
+++        *s->y_mvs[job][i]++ = 0; // y2_x2
+++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; // width shifted up 16 bits, plus height
+++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+++        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight denom + 6
+++        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+++        *s->y_mvs[job][i]++ = 0; // Next kernel
+++    }
+++    s->curr_y_mvs = s->y_mvs[job][0];
+ +#endif
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-new file mode 100644
+-index 0000000..db971f4
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.h
+-@@ -0,0 +1,20 @@
+-+#ifndef rpi_shader_H
+-+#define rpi_shader_H
+-+
+-+extern unsigned int rpi_shader[];
+-+
+-+#define mc_setup (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 146)
+-+#define mc_filter (rpi_shader + 360)
+-+#define mc_filter_b (rpi_shader + 670)
+-+#define mc_filter_honly (rpi_shader + 894)
+-+#define mc_exit (rpi_shader + 1048)
+-+#define mc_exit1 (rpi_shader + 1066)
+-+#define mc_interrupt_exit (rpi_shader + 1082)
+-+#define mc_interrupt_exit4 (rpi_shader + 1120)
+-+#define mc_interrupt_exit8 (rpi_shader + 1142)
+-+#define mc_setup_uv (rpi_shader + 1172)
+-+#define mc_filter_uv_b (rpi_shader + 1314)
+-+#define mc_end (rpi_shader + 1542)
+-+
+++    s->ctu_count = 0;
+++}
+ +#endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-new file mode 100644
+-index 0000000..6851e83
+---- /dev/null
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -0,0 +1,1413 @@
+-+# register allocation
+-+#
+-+# ra0...ra7                                     eight horizontal filter coefficients
+-+#
+-+# rb1...rb7                                     seven shifted copies of the current unfiltered row
+-+#
+-+# ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
+-+#
+-+#                                               (ra15 isn't clamped to zero - this happens during the
+-+#                                                copy to ra14, and during its use in the vertical filter)
+-+#
+-+# rb8...rb15                                    eight vertical filter coefficients
+-+#
+-+# ra16                                          clipped(row start address+elem_num)&~3
+-+# ra17                                          per-channel shifts
+-+# ra19                                          next ra17
+-+#
+-+# rb16                                          pitch
+-+# rb17                                          height + 5
+-+# rb18                                          height + 7
+-+# rb19                                          next ra16
+-+#
+-+# ra20                                          1
+-+# ra21                                          64
+-+# ra22                                          256
+-+# ra23                                          8
+-+#
+-+# rb20                                          0xffffff00
+-+# rb21                                          64
+-+# rb22                                          255
+-+# rb23                                          24
+-+#
+-+# rb24                                          vdw_setup_1(dst_pitch)
+-+# rb25                                          frame width-1
+-+# rb26                                          height<<23 + width<<16 + vdw_setup_0
+-+# rb27                                          vdw_setup_0 (depends on QPU number)
+-+# rb28                                          vpm_setup (depends on QPU number)
+-+# rb29                                          vdw_setup_1(dst_pitch-width)
+-+# rb30                                          frame height-1
+-+# rb31                                          used as temp to count loop iterations
+-+#
+-+# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
+-+# ra24                                          clipped(row start address+8+elem_num)&~3
+-+# ra25                                          per-channel shifts 2
+-+# ra26                                          next ra24
+-+# ra27                                          next ra25
+-+# ra28                                          next y
+-+# ra29                                          y for next texture access
+-+#
+-+# ra31                                          next kernel address
+ +
+-+.set rb_frame_width_minus_1,       rb25
+-+.set rb_frame_height_minus_1,      rb30
+-+.set rb_pitch,                     rb16
+-+.set ra_x_base,                    ra16
+-+.set rb_x_base_next,               rb19
+-+.set ra_x2_base,                   ra24
+-+.set ra_x2_base_next,              ra26
+-+.set ra_xshift,                    ra17
+++#ifdef RPI_SIMULATE_QPUS
+ +
+-+.set ra_x2shift,                   ra25
+-+.set ra_u2v_ref_offset,            ra25
+++static int32_t clipx(int x,int FRAME_WIDTH) /* clamp column x into [0, FRAME_WIDTH-1] */
+++{
+++	if (x<=0) return 0; /* at or left of the first column */
+++	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; /* at or right of the last column */
+++	return x; /* already inside the range */
+++}
+ +
+-+.set ra_xshift_next,               ra19
+++static int32_t clipy(int y,int FRAME_HEIGHT) /* clamp row y into [0, FRAME_HEIGHT-1] */
+++{
+++	if (y<=0) return 0; /* at or above the first row */
+++	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; /* at or below the last row */
+++	return y; /* already inside the range */
+++}
+ +
+-+.set ra_x2shift_next,              ra27
+-+.set ra_u2v_dst_offset,            ra27
+++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+++{
+++   int32_t vsum = 0;
+++   int x, y;
+ +
+-+.set ra_y_next,                    ra28
+-+.set ra_y,                         ra29
+++   for (y = 0; y < 8; y++) {
+++      int32_t hsum = 0;
+ +
+-+.set rb_const_64,                  rb21
+++      for (x = 0; x < 8; x++)
+++         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+ +
+-+# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
+-+::mc_setup
+++      vsum += lumaFilter[my][y]*hsum;
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+round)>>denom)+offset;
+ +
+-+# Read starting kernel
+-+mov ra31, unif
+++   return av_clip_uint8( vsum );
+++}*/
+ +
+-+# Load first request location
+-+add ra_x_base, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_x2_base, unif # Store frame base
+++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) // Computes one weighted sample from a 4x4 separable filter (4 taps each way, despite the "8" in the name) anchored at (x0,y0) of plane `data`; the result is returned without clipping.
+++{
+++  int32_t vsum = 0;
+++  int x, y;
+++  int chromaFilterH[4];
+++  int chromaFilterV[4];
+++  int i;
+++  int offset_after = offset_weight>>16; // upper 16 bits: added after the final shift below
+++  int weight = (offset_weight<<16)>>16; // lower 16 bits; the shift pair is intended to sign-extend them
+++  for(i=0;i<4;i++) { // unpack tap i from bits 8*i..8*i+7 of each packed coefficient word
+++    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+++    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+++  }
+ +
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++   for (y = 0; y < 4; y++) {
+++      int32_t hsum = 0;
+ +
+-+# get source pitch
+-+mov rb16, unif
+++      for (x = 0; x < 4; x++)
+++         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; // source coordinates are clamped to the picture
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++      vsum += chromaFilterV[y]*hsum; // vertical tap applied to the horizontally filtered row
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; // weight, add pre-shift offset, shift, add post-shift offset
+ +
+-+# load constants
+++   return vsum;
+++}
+ +
+-+mov ra20, 1
+-+mov ra21, 64
+-+mov ra22, 256
+-+mov ra23, 8
+++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; // 8-tap luma filter table indexed [filter 0..3][tap]; each row sums to 64
+ +
+-+mov rb20, 0xffffff00
+-+mov rb21, 64
+-+mov rb22, 255
+-+mov rb23, 24
+++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) // Computes one weighted sample from an 8x8 separable lumaFilter pass anchored at (x0,y0) of plane `data`; the result is returned without clipping.
+++{
+++  int32_t vsum = 0;
+++  int x, y;
+++  int i; // not used in this function
+++  int offset_after = offset_weight>>16; // upper 16 bits: added after the final shift below
+++  int weight = (offset_weight<<16)>>16; // lower 16 bits; the shift pair is intended to sign-extend them
+ +
+-+# touch vertical context to keep simulator happy
+++   for (y = 0; y < 8; y++) {
+++      int32_t hsum = 0;
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++      for (x = 0; x < 8; x++)
+++         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; // bits 0-1 of my_mx pick the horizontal filter row
+ +
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+++      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; // bits 8-9 of my_mx pick the vertical filter row
+++   }
+++   vsum >>= 6;
+++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; // weight, add pre-shift offset, shift, add post-shift offset
+ +
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+add rb28, r0, r1
+++   return vsum;
+++}
+ +
+-+# Compute base address for first and second access
+-+#add r0, unif, elem_num     # x
+-+mov r0, ra_x_base           # Load x
+-+add r2, r0, 8               # x+8
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+add ra_y, r1, 1
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+max r1, r1, 0  # y
+-+min r1, r1, rb_frame_height_minus_1
+-+add r0, r0, r3; mul24 r1, r1, rb_pitch
+-+add r2, r2, r3
+-+and r0, r0, ~3
+-+and r2, r2, ~3; mov ra_x_base, r0
+-+# submit texture requests for first line
+-+add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r2, r1
+++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) // Returns the frame->data[cIdx] pointer matching address p if p lies inside plane cIdx of `frame`, otherwise NULL.
+++{
+++  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+++  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; // plane height after vertical subsampling
+++  int pitch = frame->linesize[cIdx];
+++  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+++    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame); // plane base address: 0 -> y, 1 -> u, anything else -> v
+++  if (p>=base && p<base+pitch*pic_height) { // p falls within [base, base + pitch*pic_height)
+++    return frame->data[cIdx] + (p-base); // same byte offset applied to the data[] pointer
+++  }
+++  return NULL;
+++}
+ +
+-+# Dump padding words
+-+mov r0, unif
+-+mov r0, unif
+++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) // Maps address p to a pointer inside the current frame or one of the slice's reference frames (plane cIdx); prints a message and exits the process if no frame contains p.
+++{
+++  SliceHeader *sh   = &s->sh;
+++  uint8_t *arm = test_frame(s,p,s->frame,cIdx); // try the current frame first
+++  int i;
+++  if (arm) return arm;
+++  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+++  {
+++    for(i=0;i<sh->nb_refs[L0];i++) { // then every list-0 reference
+++      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+++      if (arm) return arm;
+++    }
+++  }
+++  if (sh->slice_type == B_SLICE) {
+++    for(i=0;i<sh->nb_refs[L1];i++) { // list-1 references are searched for B slices only
+++      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+++      if (arm) return arm;
+++    }
+++  }
+++  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+++  exit(-1); // no match is treated as fatal
+++  return NULL; // not reached when exit() does not return
+++}
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x_base
+-+add t0s, r1, ra_x2_base
+++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) // Walks a list of 12-word chroma commands starting at p and performs each filter in C until an exit kernel is reached.
+++{
+++  uint32_t next_kernel;
+++  uint32_t x0;
+++  uint32_t y0;
+++  uint8_t *ref_u_base;
+++  uint8_t *ref_v_base;
+++  uint32_t frame_width = p[5]; // words 5..11 of the first command are read once, before the loop
+++  uint32_t frame_height = p[6];
+++  uint32_t pitch = p[7];
+++  uint32_t dst_pitch = p[8];
+++  int32_t offset_before = p[9];
+++  int32_t denom = p[10];
+++  uint32_t vpm_id = p[11]; // read but not used below
+++  uint32_t tmp_u_dst[256]; // unclipped mc_filter_uv_b0 results, indexed x+y*16, consumed by the mc_filter_uv_b branch
+++  uint32_t tmp_v_dst[256];
+++  while(1) {
+++    p += 12; // step to the next command
+++    next_kernel = p[0-12]; // words 0..4 are taken from the command just stepped over
+++    x0 = p[1-12];
+++    y0 = p[2-12];
+++    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+++      int x,y;
+++      uint32_t width_height = p[5]; // words 5..11 are taken from the command p now points at
+++      uint32_t hcoeffs = p[6];
+++      uint32_t vcoeffs = p[7];
+++      uint32_t offset_weight_u = p[8];
+++      uint32_t offset_weight_v = p[9];
+++      uint8_t *this_u_dst;
+++      uint8_t *this_v_dst;
+++      uint32_t width = width_height >> 16; // width in the upper half-word
+++      uint32_t height = (width_height << 16) >> 16; // height in the lower half-word
+++      ref_u_base = compute_arm_addr(s,p[3-12],1);
+++      ref_v_base = compute_arm_addr(s,p[4-12],2);
+++      if (next_kernel!=s->mc_filter_uv_b0) // the b0 kernel writes only to the tmp arrays, so its destination words are not resolved
+++      {
+++        this_u_dst = compute_arm_addr(s,p[10],1);
+++        this_v_dst = compute_arm_addr(s,p[11],2);
+++      }
+++      for (y=0; y<height; ++y) {
+++        for (x=0; x<width; ++x) {
+++          if (next_kernel==s->mc_filter_uv) { // single prediction: weighted, clipped, stored
+++            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          } else if (next_kernel==s->mc_filter_uv_b0) { // first half of a two-part prediction: weight 1, no offsets, no shift
+++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+++            tmp_u_dst[x+y*16] = refa;
+++            tmp_v_dst[x+y*16] = refb;
+++          } else { // mc_filter_uv_b: (this prediction + stored b0 value + 64) >> 7, then clipped
+++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          }
+++        }
+++      }
+++    } else {
+++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); // anything that is not a filter kernel must be one of the two exit kernels
+++      break;
+++    }
+++  }
+++}
+ +
+-+################################################################################
+++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan) // Walks a list of 9-word luma commands starting at p and performs each filter in C until an exit kernel is reached; `chan` is not used in the body.
+++{
+++  uint32_t next_kernel;
+++  int y_x,y2_x2;
+++  int x0;
+++  int y0;
+++  int x2;
+++  int y2;
+++  uint32_t *p0 = p; // not used below
+++  uint8_t *ref_y_base;
+++  uint8_t *ref_y2_base;
+++  uint32_t frame_width_height = p[4]; // words 4..7 of the first command are read once, before the loop
+++  uint32_t frame_width = frame_width_height>>16; // width in the upper half-word
+++  uint32_t frame_height = (frame_width_height<<16)>>16; // height in the lower half-word
+++  uint32_t pitch = p[5];
+++  uint32_t dst_pitch = p[6];
+++  int offset_shift = p[7];
+++  int32_t offset_before = offset_shift>>16; // upper half: offset added before the shift
+++  int32_t denom = (offset_shift<<16)>>16; // lower half: shift amount
+++  while(1) {
+++    p += 9; // step to the next command
+++    next_kernel = p[8-9]; // the kernel word is the last word of the command just stepped over
+++    y_x = p[0-9];
+++    x0 = (y_x<<16)>>16; // x in the lower half-word (signed int shifts, intended to keep the sign)
+++    y0 = y_x>>16; // y in the upper half-word
+++    y2_x2 = p[2-9];
+++    x2 = (y2_x2<<16)>>16;
+++    y2 = y2_x2>>16;
+ +
+-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+++    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+++      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++      int x,y;
+++      uint32_t width_height = p[4]; // words 4..7 are taken from the command p now points at
+++      uint32_t my2_mx2_my_mx = p[5];
+++      uint32_t offset_weight = p[6];
+++      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+++      uint32_t width = width_height >> 16;
+++      uint32_t height = (width_height << 16) >> 16;
+++      uint8_t *dst_base = s->frame->data[0]; // not used below
+++      ref_y_base = compute_arm_addr(s,p[1-9],0);
+++      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+++      for (y=0; y<height; ++y) {
+++        for (x=0; x<width; ++x) {
+++          if (next_kernel==s->mc_filter) { // single prediction: weighted, clipped, stored
+++            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+++            refa = av_clip_uint8(refa);
+++            this_dst[x+y*dst_pitch] = refa;
+++          }
+++          else { // mc_filter_b: the first prediction is fed in as the offset of the second, giving (refa + second + 64) >> 7
+++            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+++            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height); // upper half of my2_mx2_my_mx selects the filters for the second reference
+++            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+++          }
+++        }
+++      }
+++    } else {
+++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); // anything that is not a filter kernel must be one of the two exit kernels
+++      break;
+++    }
+++  }
+++}
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_uv
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+-+# get filter coefficients
+++static void rpi_simulate_inter_qpu(HEVCContext *s) // Runs the transform, then the C chroma model on 8 command lists and the C luma model on 12 command lists.
+++{
+++  // First run the transform as normal
+++  int i;
+++  rpi_execute_transform(s);
+++  for(i=0;i<8;i++)
+++  {
+++    rpi_simulate_inter_chroma(s,s->mvs_base[i]); // NOTE(review): elsewhere in this file mvs_base is indexed [job][k]; confirm a single index is intended here
+++  }
+++  for(i=0;i<12;i++)
+++  {
+++    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i); // NOTE(review): same single-index question for y_mvs_base
+++  }
+++}
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++#endif
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++#ifdef RPI_INTER_QPU
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++static void rpi_launch_vpu_qpu(HEVCContext *s) // Terminates the per-QPU command lists for job s->pass1_job, then either simulates them or posts them together with the coefficient buffers.
+++{
+++    int k;
+++    int job = s->pass1_job;
+++    int i;
+++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+++#ifdef RPI_LUMA_QPU
+++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+++#endif
+++    if (s->sh.slice_type == I_SLICE) { // I slices: with RPI_MULTI_MAILBOX only the transform is executed and the function returns
+++#ifdef RPI_MULTI_MAILBOX
+++      rpi_execute_transform(s);
+++      return;
+++#endif
+++    }
+++    for(k=0;k<8;k++) {
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+++        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+++    }
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+ +
+-+mov r3, 0
+++#ifdef RPI_LUMA_QPU
+++    for(k=0;k<12;k++) {
+++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+++        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+++        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+++    }
+++    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+++#endif
+ +
+-+:uvloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++#ifdef RPI_SIMULATE_QPUS
+++    rpi_simulate_inter_qpu(s); // simulation build: run the C model and skip everything below
+++    return;
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++#ifdef RPI_MULTI_MAILBOX
+++#ifdef RPI_CACHE_UNIF_MVS
+++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job); // also flush both uniform buffers
+++#else
+++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+++#endif
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++#if 1
+++    {
+++        unsigned int i; // shadows the outer int i within this block
+++        uint32_t * p;
+++        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; // filled below with one (uniforms address, code address) pair per entry
+++        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+++
+++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+++            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); // word offset from the .arm base, re-applied to the .vc base
+++            *p++ = code;
+++        }
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++        code = qpu_get_fn(QPU_MC_SETUP);
+++        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+++            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)); // NOTE(review): y_unif_vc is declared only under RPI_LUMA_QPU above - confirm that macro is always defined on this path
+++            *p++ = code;
+++        }
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+++            vpu_get_constants(),
+++            s->coeffs_buf_vc[job][2],
+++            s->num_coeffs[job][2] >> 8,
+++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+++            s->num_coeffs[job][3] >> 10,
+++            0,
+++            // QPU job 1
+++            QPU_N_UV,
+++            mail_uv,
+++            // QPU job 2
+++            QPU_N_Y,
+++            mail_y
+++            );
+++    }
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++#else
+++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+++                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+++                                   qpu_get_fn(QPU_MC_SETUP_UV),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++#ifdef RPI_LUMA_QPU
+++                                   qpu_get_fn(QPU_MC_SETUP),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+++#else
+++                                   0,
+++                                   0,0,0,0,
+++                                   0,0,0,0,
+++                                   0,0,0,0
+++#endif
+++                                 );
+++#endif
+++    for(i=0;i<4;i++) // reset the coefficient counts for this job after posting
+++        s->num_coeffs[job][i] = 0;
+++#else
+++#error Code rotted here
+++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+++      );
+++#endif
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+ +
+-+# apply vertical filter and write to VPM
+++}
+++#else
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++#ifdef RPI
+++static void rpi_launch_vpu_qpu(HEVCContext *s) // Variant used when RPI_INTER_QPU is not defined: only runs the transform.
+++{
+++  rpi_execute_transform(s);
+++}
+++#endif
+ +
+-+# DMA out for U
+++#endif
+ +
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++#ifdef RPI
+ +
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++#ifndef RPI_FAST_CACHEFLUSH // NOTE(review): the #error below fires when RPI_FAST_CACHEFLUSH is NOT defined, so this fallback cannot currently be built
+++#error RPI_FAST_CACHEFLUSH is broken
+++static void flush_buffer(AVBufferRef *bref) { // Flushes the GPU memory object obtained from the buffer's pool opaque pointer.
+++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+++    gpu_cache_flush(p);
+++}
+++#endif
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++static void flush_frame(HEVCContext *s,AVFrame *frame) // Cleans and invalidates the cache for all three planes of a frame. NOTE(review): the RPI_FAST_CACHEFLUSH path reads s->frame; the `frame` argument is used only by the #else path.
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+++    int n = s->ps.sps->height;
+++    int curr_y = 0; // start rows are fixed at 0, so every row of each plane is covered
+++    int curr_uv = 0;
+++    int n_uv = n >> s->ps.sps->vshift[1]; // chroma row count after vertical subsampling
+++    int sz,base;
+++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++    base = s->frame->linesize[1] * curr_uv;
+++    iocache.s[0].handle = p.vcsm_handle; // entry 0: U plane
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int)(p.arm) + base;
+++    iocache.s[0].size  = sz;
+++    p = get_gpu_mem_ptr_v(s->frame);
+++    iocache.s[1].handle = p.vcsm_handle; // entry 1: V plane, same size and offset as U
+++    iocache.s[1].cmd = 3; // clean+invalidate
+++    iocache.s[1].addr = (int)(p.arm) + base;
+++    iocache.s[1].size  = sz;
+++    p = get_gpu_mem_ptr_y(s->frame);
+++    sz = s->frame->linesize[0] * (n-curr_y);
+++    base = s->frame->linesize[0] * curr_y;
+++    iocache.s[2].handle = p.vcsm_handle; // entry 2: Y plane
+++    iocache.s[2].cmd = 3; // clean+invalidate
+++    iocache.s[2].addr = (int)(p.arm) + base;
+++    iocache.s[2].size  = sz;
+++    vcsm_clean_invalid( &iocache ); // all three entries are submitted in one call
+++#else
+++    flush_buffer(frame->buf[0]);
+++    flush_buffer(frame->buf[1]);
+++    flush_buffer(frame->buf[2]);
+++#endif
+++}
+ +
+-+################################################################################
+++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job) // Cleans and invalidates the rows of s->frame touched by the job's deblock commands, plus buffer p0 and, when non-NULL, p1 and p2.
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++    int n;
+++    int curr_y;
+++    int curr_uv;
+++    int n_uv;
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+++    int sz,base;
+++    int (*d)[2] = s->dblk_cmds[job];
+++    int low=(*d)[1]; // element [1] of each command is treated as its y coordinate
+++    int high=(*d)[1];
+++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { // find the smallest and largest y over the job's commands
+++        int y = (*d)[1];
+++        low=FFMIN(low,y);
+++        high=FFMAX(high,y);
+++    }
+++    curr_y = low;
+++    n = high+(1 << s->ps.sps->log2_ctb_size); // end row: largest y plus one CTB height
+++    curr_uv = curr_y >> s->ps.sps->vshift[1];
+++    n_uv = n >> s->ps.sps->vshift[1];
+ +
+++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++    base = s->frame->linesize[1] * curr_uv;
+++    iocache.s[0].handle = p.vcsm_handle; // entry 0: U plane rows
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int)(p.arm) + base;
+++    iocache.s[0].size  = sz;
+++    p = get_gpu_mem_ptr_v(s->frame);
+++    iocache.s[1].handle = p.vcsm_handle; // entry 1: V plane rows, same size and offset as U
+++    iocache.s[1].cmd = 3; // clean+invalidate
+++    iocache.s[1].addr = (int)(p.arm) + base;
+++    iocache.s[1].size  = sz;
+++    p = get_gpu_mem_ptr_y(s->frame);
+++    sz = s->frame->linesize[0] * (n-curr_y);
+++    base = s->frame->linesize[0] * curr_y;
+++    iocache.s[2].handle = p.vcsm_handle; // entry 2: Y plane rows
+++    iocache.s[2].cmd = 3; // clean+invalidate
+++    iocache.s[2].addr = (int)(p.arm) + base;
+++    iocache.s[2].size  = sz;
+ +
+-+# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+-+
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter
+-+mov ra31, unif
+++    iocache.s[3].handle = p0->vcsm_handle; // entry 3: p0 is dereferenced unconditionally, so it must not be NULL
+++    iocache.s[3].cmd = 3; // clean+invalidate
+++    iocache.s[3].addr = (int) p0->arm;
+++    iocache.s[3].size  = p0->numbytes;
+++    if (p1) { // entries 4 and 5 keep their zero initialisation when the pointer is NULL
+++      iocache.s[4].handle = p1->vcsm_handle;
+++      iocache.s[4].cmd = 3; // clean+invalidate
+++      iocache.s[4].addr = (int) p1->arm;
+++      iocache.s[4].size  = p1->numbytes;
+++    }
+++    if (p2) {
+++      iocache.s[5].handle = p2->vcsm_handle;
+++      iocache.s[5].cmd = 3; // clean+invalidate
+++      iocache.s[5].addr = (int) p2->arm;
+++      iocache.s[5].size  = p2->numbytes;
+++    }
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    flush_buffer(frame->buf[0]);
+++    flush_buffer(frame->buf[1]);
+++    flush_buffer(frame->buf[2]);
+++    gpu_cache_flush3(p0, p1, p2);
+++#endif
+++}
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#endif
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
++ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++ {
++     HEVCContext *s  = avctxt->priv_data;
++@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++     int y_ctb       = 0;
++     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++ 
+++#ifdef RPI
+++    s->enable_rpi = s->ps.sps->bit_depth == 8
+++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
+++    if (!s->enable_rpi) {
+++      if (s->ps.pps->cross_component_prediction_enabled_flag)
+++        printf("Cross component\n");
+++    }
+++#endif
+++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L0],s->sh.nb_refs[L1]);
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
++     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
++         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++         return AVERROR_INVALIDDATA;
++@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         }
++     }
++ 
+++#ifdef RPI_WORKER
+++    s->pass0_job = 0;
+++    s->pass1_job = 0;
+++#endif
+++#ifdef RPI
+++    rpi_begin(s);
+++#endif
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
++     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
++         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ 
++@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
++         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
++ 
+++#ifdef RPI_INTER_QPU
+++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+++#endif
+++#ifdef RPI_LUMA_QPU
+++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+++#endif
+ +
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
++         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+ +
+-+# get filter coefficients
+++#ifdef RPI_INTER_QPU
+++        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+++#endif
+++#ifdef RPI_LUMA_QPU
+++        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+++#endif
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+brr.anynn -, r:fast_path
+-+asr rb12, r0, rb23  # delay slot 1
+++#ifdef RPI
+++        if (s->enable_rpi) {
+++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+++          //av_assert0(s->pass0_job>=0);
+++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+++          s->ctu_count++;
+++          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++          if ( s->ctu_count >= s->max_ctu_count ) {
+++#ifdef RPI_WORKER
+++            if (s->used_for_ref) {
+++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+++              // Pass on this job to worker thread
+++              worker_submit_job(s);
+++              // Make sure we have space to prepare the next job
+++              worker_pass0_ready(s);
+ +
+-+mov r5rep, -8 # delay slot 2
+++              // Prepare the next batch of commands
+++              rpi_begin(s);
+++            } else {
+++              // Non-ref frame so do it all on this thread
+++              rpi_do_all_passes(s);
+++            }
+++#else
+++            rpi_do_all_passes(s);
+++#endif
+++          }
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+++        }
+++#endif
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+ +
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r2, r2, r3                                                    ; ldtmu0
+-+##
+-+## mov r0, ra22
+-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # apply horizontal filter
+-+##
+-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
+-+## min r2, r2, rb22
+-+## max ra13, r2, 0
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21
+-+## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra14, r0, 0
+-+##
+-+##
+-+##
+-+##
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra15, r0, 0
+-+
+-+
+-+
+-+
+-+mov r3, 0
+-+
+-+:loop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++         if (more_data < 0) {
++             s->tab_slice_address[ctb_addr_rs] = -1;
++             return more_data;
++@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++ 
++         ctb_addr_ts++;
++         ff_hevc_save_states(s, ctb_addr_ts);
+++#ifdef RPI
+++        if (s->enable_rpi)
+++            continue;
+++#endif
++         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
++     }
++ 
+++#ifdef RPI
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++#ifdef RPI_WORKER
+++    // Wait for the worker to finish all its jobs
+++    if (s->enable_rpi) {
+++        worker_wait(s);
+++    }
+++#endif
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++    // Finish off any half-completed rows
+++    if (s->enable_rpi && s->ctu_count) {
+++        rpi_do_all_passes(s);
+++    }
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++#endif
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++     if (x_ctb + ctb_size >= s->ps.sps->width &&
++         y_ctb + ctb_size >= s->ps.sps->height)
++         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
++@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
++     s = s1->sList[self_id];
++     lc = s->HEVClc;
++ 
+++#ifdef RPI
+++    s->enable_rpi = 0;
+++    //printf("Wavefront\n");
+++#endif
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
++     if(ctb_row) {
++         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
++ 
++@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
++         if (ret < 0)
++             return ret;
++ 
+++        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+++                        s->nal_unit_type == NAL_TSA_N   ||
+++                        s->nal_unit_type == NAL_STSA_N  ||
+++                        s->nal_unit_type == NAL_RADL_N  ||
+++                        s->nal_unit_type == NAL_RASL_N);
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:loop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+++            s->is_decoded = 0;
+++            break;
+++        }
++         if (s->max_ra == INT_MAX) {
++             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
++                 s->max_ra = s->poc;
++@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++     }
++ 
++ fail:
++-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+++#ifdef RPI_INTER_QPU
+++        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+++#endif
++         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
++-
+++    } else if (s->ref) {
+++#ifdef RPI_INTER_QPU
+++      // When running single threaded we need to flush the whole frame
+++      flush_frame(s,s->frame);
+++#endif
+++    }
++     return ret;
++ }
++ 
++@@ -3064,6 +4625,41 @@ fail:
++     return AVERROR(ENOMEM);
++ }
++ 
+++#ifdef RPI_WORKER
+++static av_cold void hevc_init_worker(HEVCContext *s)
+++{
+++    int err;
+++    pthread_cond_init(&s->worker_cond_head, NULL);
+++    pthread_cond_init(&s->worker_cond_tail, NULL);
+++    pthread_mutex_init(&s->worker_mutex, NULL);
+ +
+-+# apply vertical filter and write to VPM
+++    s->worker_tail=0;
+++    s->worker_head=0;
+++    s->kill_worker=0;
+++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+++    if (err) {
+++        printf("Failed to create worker thread\n");
+++        exit(-1);
+++    }
+++}
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:loop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++static av_cold void hevc_exit_worker(HEVCContext *s)
+++{
+++    void *res;
+++    s->kill_worker=1;
+++    pthread_cond_broadcast(&s->worker_cond_tail);
+++    pthread_join(s->worker_thread, &res);
+ +
+-+# DMA out
+++    pthread_cond_destroy(&s->worker_cond_head);
+++    pthread_cond_destroy(&s->worker_cond_tail);
+++    pthread_mutex_destroy(&s->worker_mutex);
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+++    s->worker_tail=0;
+++    s->worker_head=0;
+++    s->kill_worker=0;
+++}
+++#endif
+ +
+-+####################################################
+-+
+-+:fast_path
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21         ; mul24 r3, r0, ra0
+-+## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+## sub r2, r2, r3                                                    ; ldtmu0
+-+##
+-+## mov r0, ra22
+-+## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # apply horizontal filter
+-+##
+-+## asr r2, r2, 15    ; mul24 r3, r0, ra0
+-+## min r2, r2, rb22
+-+## max ra13, r2, 0
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21
+-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra14, r0, 0
+-+##
+-+##
+-+##
+-+##
+-+## nop                                                                 ; ldtmu0     # loop counter increment
+-+## shr r0, r4, ra17                                                    ; ldtmu0
+-+## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+-+## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+-+## add ra16, ra16, rb16 ; mov t0s, ra16
+-+##
+-+## # generate seven shifted versions
+-+## # interleave with scroll of vertical context
+-+##
+-+## mov r2, rb21   ; mul24    r3, r0, ra0
+-+## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+-+## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+-+## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+-+## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+-+## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+-+## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+-+## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+-+## sub r0, r2, r3
+-+##
+-+## # apply horizontal filter
+-+##
+-+## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+-+## asr r0, r0, 15
+-+## min r0, r0, rb22
+-+## max ra15, r0, 0
+-+
+-+
+-+mov r3, 0  # This signifies the amount of unrolling
+-+
+-+:fast_loop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++ {
++     HEVCContext       *s = avctx->priv_data;
++@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++ 
++     av_freep(&s->cabac_state);
++ 
+++#ifdef RPI
+ +
+-+# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+++#ifdef RPI_WORKER
+++    hevc_exit_worker(s);
+++#endif
+ +
+-+max r2, ra_y, 0
+-+min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+add t0s, ra_x2_base, r2
+++    for(i=0;i<RPI_MAX_JOBS;i++) {
+++      av_freep(&s->unif_mv_cmds[i]);
+++      av_freep(&s->univ_pred_cmds[i]);
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+sub r0, r2, r3       ; mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8       ; mov r1, ra22
+++#ifdef RPI_INTER_QPU
+++      if (s->unif_mvs[i]) {
+++        gpu_free( &s->unif_mvs_ptr[i] );
+++        s->unif_mvs[i] = 0;
+++      }
+++#endif
+++#ifdef RPI_LUMA_QPU
+++      if (s->y_unif_mvs[i]) {
+++        gpu_free( &s->y_unif_mvs_ptr[i] );
+++        s->y_unif_mvs[i] = 0;
+++      }
+++#endif
+++    }
+ +
+-+# apply horizontal filter
+++#endif
+ +
+-+brr.anyn -, r:fast_loop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
++     for (i = 0; i < 3; i++) {
++         av_freep(&s->sao_pixel_buffer_h[i]);
++         av_freep(&s->sao_pixel_buffer_v[i]);
++@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++     return 0;
++ }
++ 
+++#ifdef RPI
+++#ifdef RPI_PRECLEAR
+++static av_cold void memclear16(int16_t *p, int n)
+++{
+++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+++  //int i;
+++  //for(i=0;i<n;i++)
+++  //  p[i] = 0;
+++}
+++#endif
+++#endif
+ +
+-+# apply vertical filter and write to VPM
++ static av_cold int hevc_init_context(AVCodecContext *avctx)
++ {
++     HEVCContext *s = avctx->priv_data;
++     int i;
+++    int job;
++ 
++     s->avctx = avctx;
++ 
++@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
++     s->HEVClcList[0] = s->HEVClc;
++     s->sList[0] = s;
++ 
+++#ifdef RPI
+++    for(job=0;job<RPI_MAX_JOBS;job++) {
+++        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+++        if (!s->unif_mv_cmds[job])
+++            goto fail;
+++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+++        if (!s->univ_pred_cmds[job])
+++            goto fail;
+++    }
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+brr.anyn -, r:fast_loop
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+max vpm, r1, 0
+++#ifdef RPI_INTER_QPU
+++    // We divide the image into blocks 256 wide and 64 high
+++    // We support up to 2048 widths
+++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+++    // Also add space for the startup command for each stream.
+ +
+-+# DMA out
+++    {
+++        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+++        uint32_t *p;
+++		for(job=0;job<RPI_MAX_JOBS;job++) {
+++#ifdef RPI_CACHE_UNIF_MVS
+++          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++#else
+++          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++#endif
+++          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+++          // Set up initial locations for uniform streams
+++          p = s->unif_mvs[job];
+++          for(i = 0; i < 8; i++) {
+++            s->mvs_base[job][i] = p;
+++            p += uv_commands_per_qpu;
+++          }
+++        }
+++        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+++        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+++        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+++    }
+ +
+-+################################################################################
+++#endif
+++#ifdef RPI_LUMA_QPU
+++    for(job=0;job<RPI_MAX_JOBS;job++)
+++    {
+++        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+++        uint32_t *p;
+++#ifdef RPI_CACHE_UNIF_MVS
+++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+++#else
+++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+++#endif
+++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+ +
+-+# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+++        // Set up initial locations for uniform streams
+++        p = s->y_unif_mvs[job];
+++        for(i = 0; i < 12; i++) {
+++            s->y_mvs_base[job][i] = p;
+++            p += y_commands_per_qpu;
+++        }
+++    }
+++    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+++    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+++#endif
+++    //gpu_malloc_uncached(2048*64,&s->dummy);
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_b
+-+mov ra31, unif
+++    s->enable_rpi = 0;
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#ifdef RPI_WORKER
+++    hevc_init_worker(s);
+++#endif
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
++     s->cabac_state = av_malloc(HEVC_CONTEXTS);
++     if (!s->cabac_state)
++         goto fail;
++diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
++index be91010..6b03ea8 100644
++--- a/libavcodec/hevc.h
+++++ b/libavcodec/hevc.h
++@@ -23,6 +23,9 @@
++ #ifndef AVCODEC_HEVC_H
++ #define AVCODEC_HEVC_H
++ 
+++// define RPI to split the CABAC/prediction/transform into separate stages
+++#include "config.h"
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
++ #include "libavutil/buffer.h"
++ #include "libavutil/md5.h"
++ 
++@@ -37,6 +40,29 @@
++ #include "thread.h"
++ #include "videodsp.h"
++ 
+++// define RPI to split the CABAC/prediction/transform into separate stages
+++#ifdef RPI
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, 13
+-+shl r3, r3, 8 # Mask off top 8 bits
+-+shr r3, r3, 8
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+# In a B frame, so also set up VPM read
+-+add vr_setup, r3, rb28
+++  #include "rpi_qpu.h"
+++  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+++  #define RPI_INTER_QPU
+ +
+-+# get filter coefficients
+++  #ifdef RPI_INTER_QPU
+++    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+++    #define RPI_LUMA_QPU
+++  #endif
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+++  #define RPI_MAX_JOBS 2
+++  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+++  #define RPI_WORKER
+++  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+++//  #define RPI_DEBLOCK_VPU
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++#endif
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov r3, 0
+++#define RPI_VPU_DEBLOCK_CACHED 1
+ +
+-+:bloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
++ #define MAX_DPB_SIZE 16 // A.4.1
++ #define MAX_REFS 16
++ 
++@@ -660,17 +686,6 @@ typedef struct CodingUnit {
++     uint8_t cu_transquant_bypass_flag;
++ } CodingUnit;
++ 
++-typedef struct Mv {
++-    int16_t x;  ///< horizontal component of motion vector
++-    int16_t y;  ///< vertical component of motion vector
++-} Mv;
++-
++-typedef struct MvField {
++-    DECLARE_ALIGNED(4, Mv, mv)[2];
++-    int8_t ref_idx[2];
++-    int8_t pred_flag;
++-} MvField;
++-
++ typedef struct NeighbourAvailable {
++     int cand_bottom_left;
++     int cand_left;
++@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
++     uint8_t flags;
++ } HEVCFrame;
++ 
+++#ifdef RPI_WORKER
+++typedef struct HEVCLocalContextIntra {
+++    TransformUnit tu;
+++    NeighbourAvailable na;
+++} HEVCLocalContextIntra;
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
++ typedef struct HEVCLocalContext {
+++    TransformUnit tu;
+++    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
++     uint8_t cabac_state[HEVC_CONTEXTS];
++ 
++     uint8_t stat_coeff[4];
++@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
++ 
++     int qPy_pred;
++ 
++-    TransformUnit tu;
++ 
++     uint8_t ctb_left_flag;
++     uint8_t ctb_up_flag;
++@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
++     int ct_depth;
++     CodingUnit cu;
++     PredictionUnit pu;
++-    NeighbourAvailable na;
++ 
++ #define BOUNDARY_LEFT_SLICE     (1 << 0)
++ #define BOUNDARY_LEFT_TILE      (1 << 1)
++@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
++     int boundary_flags;
++ } HEVCLocalContext;
++ 
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++#ifdef RPI
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++// The processing is done in chunks
+++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
+++// This is a distance of 1536 pixels across the screen
+++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+++// but allocate more memory and increase the latency before data in the next frame can be processed
+++#define RPI_NUM_CHUNKS 1
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:bloop
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+++#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+++// Each block can have an intra prediction and a transform_add command
+++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+++// Worst case is 16x16 CTUs
+++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+ +
+-+# apply vertical filter and write to VPM
+++#define RPI_CMD_LUMA_UNI 0
+++#define RPI_CMD_CHROMA_UNI 1
+++#define RPI_CMD_LUMA_BI 2
+++#define RPI_CMD_CHROMA_BI 3
+++#define RPI_CMD_V_BI 4
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 15          ; mov -, vr_wait
+-+min r1, r1, rb22
+-+add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+brr.anyn -, r:bloop
+-+max r1, r1, 0
+-+add r1, r1, r0
+-+shr vpm, r1, 1
+++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+++// #define RPI_PRECLEAR
+ +
+-+# DMA out
+++// Command for inter prediction
+++typedef struct HEVCMvCmd {
+++    int cmd;
+++    uint8_t *dst;
+++    ptrdiff_t dststride;
+++    uint8_t *src;
+++    ptrdiff_t srcstride;
+++    Mv mv;
+++    int x_off;
+++    int y_off;
+++    int block_w;
+++    int block_h;
+++    int weight;
+++    int offset;
+++    uint8_t *src1;
+++    ptrdiff_t srcstride1;
+++    Mv mv1;
+++    int8_t ref_idx[2];
+++} HEVCMvCmd;
+ +
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
+ +
+-+################################################################################
+++// Command for intra prediction and transform_add of predictions to coefficients
+++#define RPI_PRED_TRANSFORM_ADD 0
+++#define RPI_PRED_INTRA 1
+++typedef struct HEVCPredCmd {
+++    uint8_t size;
+++    uint8_t type;
+++    uint8_t na;
+++    uint8_t c_idx;
+++    union {
+++        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+++        uint32_t x;   // RPI_PRED_INTRA
+++    };
+++    union {
+++        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+++        uint32_t y;   // RPI_PRED_INTRA
+++    };
+++    union {
+++        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
+++        uint32_t stride;         // RPI_PRED_INTRA
+++    };
+++} HEVCPredCmd;
+ +
+-+# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+-+# This filter only does horizontal filtering.
+-+# It is assumed that the region to fetch does not include extra rows above.
+++#endif
+ +
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_honly
+-+mov ra31, unif
++ typedef struct HEVCContext {
++     const AVClass *c;  // needed by private avoptions
++     AVCodecContext *avctx;
++@@ -798,13 +895,107 @@ typedef struct HEVCContext {
++ 
++     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
++     HEVCLocalContext    *HEVClc;
++-
+++#ifdef RPI_WORKER
+++    HEVCLocalContextIntra HEVClcIntra;
+++#endif
++     uint8_t             threads_type;
++     uint8_t             threads_number;
++ 
++     int                 width;
++     int                 height;
++ 
+++    int used_for_ref;
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
+++#ifdef RPI
+++    int enable_rpi;
+++    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+++    int buf_width;
+++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+++    int num_coeffs[RPI_MAX_JOBS][4];
+++    int num_xfm_cmds[RPI_MAX_JOBS];
+++    int num_mv_cmds[RPI_MAX_JOBS];
+++    int num_pred_cmds[RPI_MAX_JOBS];
+++    int num_dblk_cmds[RPI_MAX_JOBS];
+++    int vpu_id;
+++    int pass0_job; // Pass0 does coefficient decode
+++    int pass1_job; // Pass1 does pixel processing
+++    int ctu_count; // Number of CTUs done in pass0 so far
+++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+++    int ctu_per_y_chan; // Number of CTUs per luma QPU
+++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+++#ifdef RPI_INTER_QPU
+++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+ +
+-+mov ra_xshift, ra_xshift_next
+-+mov ra_x2shift, ra_x2shift_next
+++    // _base pointers are to the start of the row
+++    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+++    // these pointers are to the next free space
+++    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+++    // Function pointers
+++    uint32_t mc_filter_uv;
+++    uint32_t mc_filter_uv_b0;
+++    uint32_t mc_filter_uv_b;
+++#endif
+++#ifdef RPI_LUMA_QPU
+++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+++    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+++    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+++    uint32_t *curr_y_mvs; // Current uniform stream for luma
+++    // Function pointers
+++    uint32_t mc_filter;
+++    uint32_t mc_filter_b;
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+add r2, r0, 8 # x+8
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+max r2, r2, 0
+-+min r2, r2, rb_frame_width_minus_1
+-+shl ra_x2shift_next, r2, 3
+-+add r0, r0, r3
+-+add r2, r2, r3
+-+and rb_x_base_next, r0, ~3
+-+and ra_x2_base_next, r2, ~3
+-+mov ra_y_next, r1
+++#ifdef RPI_WORKER
+++    pthread_t worker_thread;
+++    pthread_cond_t worker_cond_head;
+++    pthread_cond_t worker_cond_tail;
+++    pthread_mutex_t worker_mutex;
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
+++    int worker_tail; // Contains the number of posted jobs
+++    int worker_head; // Contains the number of completed jobs
+++    int kill_worker; // set to 1 to terminate the worker
+++#endif
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
+-+shl r0, r0, 7 ; mov rb18,r0
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++#define RPI_DEBLOCK_VPU_Q_COUNT 2
+ +
+-+# get filter coefficients
+++#ifdef RPI_DEBLOCK_VPU
+++    int enable_rpi_deblock;
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov r0, unif
+++    int uv_setup_width;
+++    int uv_setup_height;
+++    int setup_width; // Number of 16x16 blocks across the image
+++    int setup_height; // Number of 16x16 blocks down the image
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+-+mov r3, 0
+++    struct dblk_vpu_q_s
+++    {
+++        GPU_MEM_PTR_T deblock_vpu_gmem;
+ +
+-+:loop_honly
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++        uint8_t (*y_setup_arm)[2][2][2][4];
+++        uint8_t (*y_setup_vc)[2][2][2][4];
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++        uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+++        uint8_t (*uv_setup_vc)[2][2][2][4];
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+++        int vpu_cmds_vc;
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+++        int cmd_id;
+++    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++    struct dblk_vpu_q_s * dvq;
+++    unsigned int dvq_n;
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3       ; mov r3, rb31
+++#endif
+ +
+-+sub.setf -, r3, rb18 ; mov r1, ra22
+++#endif
+ +
+-+mov -, vw_wait   ; mul24 r0, r0, r1
+-+brr.anyn -, r:loop_honly
+-+asr r0, r0, 15          # delay 1
+-+min r0, r0, rb22        # delay 2
+-+max vpm, r0, 0          # delay 3
++     uint8_t *cabac_state;
++ 
++     /** 1 if the independent slice segment header was successfully parsed */
++@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
++     uint32_t max_mastering_luminance;
++     uint32_t min_mastering_luminance;
++ 
+++#ifdef RPI
+++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+++#endif
++ } HEVCContext;
++ 
++ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
++@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                                  int log2_trafo_size, enum ScanType scan_idx,
++                                  int c_idx);
++ 
+++#ifdef RPI_INTER_QPU
+++extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+++#endif
+ +
+-+# DMA out
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+-+mov vw_setup, rb29
+-+mov vw_addr, unif # start the VDW
++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
++ 
++ 
++diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
++index 05b2821..e2f1f4e 100644
++--- a/libavcodec/hevc_cabac.c
+++++ b/libavcodec/hevc_cabac.c
++@@ -21,14 +21,72 @@
++  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
+++#define UNCHECKED_BITSTREAM_READER 1
+ +
++ #include "libavutil/attributes.h"
++ #include "libavutil/common.h"
++ 
++-#include "cabac_functions.h"
++ #include "hevc.h"
+++#include "cabac_functions.h"
+++
+++// BY22 is probably faster than simple bypass if the processor has
+++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+++// x86 has fast int divide
+++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+++// Use native divide if we have a fast one - otherwise use mpy 1/x
+++// x86 has a fast integer divide - arm doesn't - unsure about other
+++// architectures
+++#define USE_BY22_DIV  ARCH_X86
+++
+++// Special case blocks with a single significant coeff
+++// Decreases the complexity of the code for a common case but increases the
+++// code size.
+++#define USE_N_END_1 1
+++
+++#if ARCH_ARM
+++#include "arm/hevc_cabac.h"
+++#endif
++ 
++ #define CABAC_MAX_BIN 31
++ 
+++
+++#if USE_BY22 && !USE_BY22_DIV
+++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+++
+++static const uint32_t cabac_by22_inv_range[256] = {
+++                                                    0,      I(257), I(258), I(259),
+++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+++    I(510), I(511)
+++};
+++#undef I
+++#endif  // USE_BY22 && !USE_BY22_DIV
+ +
+-+################################################################################
++ /**
++  * number of bin by SyntaxElement.
++  */
++@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
++     { 28, 36, 43, 49, 54, 58, 61, 63, },
++ };
++ 
+ +
+-+# mc_exit()
+++typedef struct
+++{
+++    uint16_t coeff;
+++    uint16_t scale;
+++} xy_off_t;
+++
+++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+++
+++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+++
+++#define OFF_DIAG(t) {\
+++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+++}
+ +
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+++#define OFF_HORIZ(t) {\
+++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+++}
+ +
+-+mov -,srel(0)
+++#define OFF_VERT(t) {\
+++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+++}
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++static const xy_off_t off_xys[3][4][16] =
+++{
+++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+++};
+ +
+-+nop        ; nop ; thrend
+-+nop        ; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+ +
+-+::mc_exit1
+-+mov  -, vw_wait # wait on the VDW
+++// Helper fns
+++#ifndef hevc_mem_bits32
+++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+++{
+++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+++}
+++#endif
+ +
+-+#mov -,srel(1)
+++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+++#define hevc_clz32 hevc_clz32_builtin
+++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+++{
+++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+++}
+++#endif
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++// It is unlikely that we will ever need this but include for completeness
+++#ifndef hevc_clz32
+++static inline unsigned int hevc_clz32(unsigned int x)
+++{
+++    unsigned int n = 1;
+++    if ((x & 0xffff0000) == 0) {
+++        n += 16;
+++        x <<= 16;
+++    }
+++    if ((x & 0xff000000) == 0) {
+++        n += 8;
+++        x <<= 8;
+++    }
+++    if ((x & 0xf0000000) == 0) {
+++        n += 4;
+++        x <<= 4;
+++    }
+++    if ((x & 0xc0000000) == 0) {
+++        n += 2;
+++        x <<= 2;
+++    }
+++    return n - ((x >> 31) & 1);
+++}
+++#endif
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+ +
+-+# mc_interrupt_exit()
+-+::mc_interrupt_exit
+-+mov  -, vw_wait # wait on the VDW
+++#if !USE_BY22
+++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+++// will no longer be called but the setup calls will still exist and we want
+++// to null them out
+++#define bypass_start(s)
+++#define bypass_finish(s)
+++#else
+++// Use BY22 for residual bypass block
+++
+++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+++
+++// BY22 notes that bypass is simply a divide into the bitstream and so we
+++// can peek out large quantities of bits at once and treat the result as if
+++// it was VLC.  In many cases this will lead to O(1) processing rather than
+++// O(n) though the setup and teardown is sufficiently expensive that it is
+++// only worth using if we expect to be dealing with more than a few bits
+++// The definition of "a few bits" will vary from platform to platform but
+++// tests on ARM show that it probably isn't worth it for a single coded
+++// residual, but is for >1 - it also seems likely that if there are
+++// more residuals then they are likely to be bigger and this will make the
+++// O(1) nature of the code more worthwhile.
+++
+++
+++#if !USE_BY22_DIV
+++// * 1/x @ 32 bits gets us 22 bits of accuracy
+++#define CABAC_BY22_PEEK_BITS  22
+++#else
+++// A real 32-bit divide gets us another bit
+++// If we have a 64 bit int & a unit time divider then we should get a lot
+++// of bits (55)  but that is untested and it is unclear if it would give
+++// us a large advantage
+++#define CABAC_BY22_PEEK_BITS  23
+++#endif
+ +
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+++// Bypass block start
+++// Must be called before _by22_peek is used as it sets the CABAC environment
+++// into the correct state.  _by22_finish must be called to return to 'normal'
+++// (i.e. non-bypass) cabac decoding
+++static inline void get_cabac_by22_start(CABACContext * const c)
+++{
+++    const unsigned int bits = __builtin_ctz(c->low);
+++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+++#if !USE_BY22_DIV
+++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+++#endif
+ +
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+mov -,sacq(0) # 8
+-+mov -,sacq(0) # 9
+-+mov -,sacq(0) # 10
+-+mov -,sacq(0) # 11
+++    c->bytestream -= (CABAC_BITS / 8);
+++    c->by22.bits = bits;
+++#if !USE_BY22_DIV
+++    c->by22.range = c->range;
+++    c->range = inv;
+++#endif
+++    c->low = x;
+++}
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++// Bypass block finish
+++// Must be called at the end of the bypass block to return to normal operation
+++static inline void get_cabac_by22_finish(CABACContext * const c)
+++{
+++    unsigned int used = c->by22.bits;
+++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+++
+++    c->bytestream += bytes_used + (CABAC_BITS / 8);
+++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+++#if !USE_BY22_DIV
+++    c->range = c->by22.range;
+++#endif
+++}
+ +
+-+# mc_interrupt_exit4()
+-+::mc_interrupt_exit4
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+# mc_interrupt_exit8()
+-+::mc_interrupt_exit8
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+::mc_setup_uv
+-+
+-+# Read starting kernel
+-+mov ra31, unif
+-+
+-+# Load first request location
+-+add ra_x_base, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_x2_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+++// Peek bypass bits
+++// _by22_start must be called before _by22_peek is called and _by22_flush
+++// must be called afterwards to flush any used bits
+++// The actual number of valid bits returned is
+++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+++// will be at least 22 which should be long enough for any prefix or suffix
+++// though probably not long enough for the worst case combination
+++#ifndef get_cabac_by22_peek
+++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+++{
+++#if USE_BY22_DIV
+++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+++#else
+++    uint32_t x = c->low & ~1U;
+++    const uint32_t inv = c->range;
+ +
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++    if (inv != 0)
+++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+ +
+-+# get source pitch
+-+mov rb16, unif
+++    return x << 1;
+++#endif
+++}
+++#endif
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++// Flush bypass bits peeked by _by22_peek
+++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+++// val is an unmodified copy of whatever _by22_peek returned
+++#ifndef get_cabac_by22_flush
+++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+++{
+++    // Subtract the bits used & reshift up to the top of the word
+++#if USE_BY22_DIV
+++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+++#else
+++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+++#endif
+ +
+-+# load constants
+++    // and refill lower bits
+++    // We will probably OR over some existing bits but that doesn't matter
+++    c->by22.bits += n;
+++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+++}
+++#endif
+ +
+-+mov ra20, 1
+-+mov ra21, 64
+-+mov ra22, 256
+-+mov ra23, 8
+++#endif  // USE_BY22
+ +
+-+mov rb20, 0xffffff00
+-+mov rb21, 64
+-+mov rb22, 255
+-+mov rb23, 24
+ +
+-+# touch vertical context to keep simulator happy
++ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
++ {
++     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
++@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
++     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
++ }
++ 
++-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
++ }
++ 
++-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
++ }
++ 
++-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
++ {
++-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
++ }
++ 
++ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
++@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
++     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
++ }
++ 
++-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
++                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
++ {
++     int i = 0;
++     int max = (log2_size << 1) - 1;
++     int ctx_offset, ctx_shift;
++ 
++-    if (!c_idx) {
+++    if (!c_idx_nz) {
++         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
++         ctx_shift = (log2_size + 1) >> 2;
++     } else {
++@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
++     return value;
++ }
++ 
++-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
++ {
++     int inc;
++ 
++-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+++    inc = (ctx_cg != 0) + (c_idx_nz << 1);
++ 
++     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
++ }
++-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
++-                                           int offset, const uint8_t *ctx_idx_map)
++-{
++-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
++-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
++-}
++ 
++-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
++ {
++     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
++ }
++@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
++     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
++ }
++ 
++-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++#if !USE_BY22
+++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+++#endif
+ +
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+ +
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+add rb28, r0, r1
+++#ifndef coeff_abs_level_remaining_decode_bypass
+++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
++ {
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint32_t y;
+++    unsigned int prefix;
+++    unsigned int last_coeff_abs_level_remaining;
+++    unsigned int n;
+++
+++    y = get_cabac_by22_peek(c);
+++    prefix = hevc_clz32(~y);
+++    // y << prefix will always have top bit 0
+++
+++    if (prefix < 3) {
+++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+++        n = prefix + 1 + rice_param;
+++    }
+++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+++    {
+++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+ +
+-+# Compute base address for first and second access
+-+mov r0, ra_x_base           # Load x
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+add ra_y, r1, 1
+-+add r0, r0, r3
+-+and r0, r0, ~3
+-+max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+-+# submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r2, r1
+++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+++        n = prefix * 2 + rice_param - 2;
+++    }
+++    else {
+++        unsigned int suffix;
+ +
+-+# Dump padding words
+-+mov r0, unif
+-+mov r0, unif
+-+mov r0, unif
+++        get_cabac_by22_flush(c, prefix, y);
+++        y = get_cabac_by22_peek(c);
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x_base
+-+add t0s, r1, ra_x2_base
+++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+++        n = prefix + rice_param - 2;
+++    }
+ +
+++    get_cabac_by22_flush(c, n, y);
+ +
+++    return last_coeff_abs_level_remaining;
+++}
+++#endif
+ +
+-+################################################################################
+++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
++     int prefix = 0;
++     int suffix = 0;
++     int last_coeff_abs_level_remaining;
++     int i;
++ 
++-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
++         prefix++;
++     if (prefix == CABAC_MAX_BIN) {
++         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
++         return 0;
++     }
+ +
+-+::mc_filter_uv_b
+-+mov ra31, unif
++     if (prefix < 3) {
++         for (i = 0; i < rc_rice_param; i++)
++-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+++            suffix = (suffix << 1) | get_cabac_bypass(c);
++         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
++     } else {
++         int prefix_minus3 = prefix - 3;
++         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
++-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+++            suffix = (suffix << 1) | get_cabac_bypass(c);
++         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
++                                               << rc_rice_param) + suffix;
++     }
+ +
+-+# per-channel shifts were calculated on the *previous* invocation
++     return last_coeff_abs_level_remaining;
++ }
++ 
++-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+++#if !USE_BY22
+++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
++ {
++-    int i;
++-    int ret = 0;
+++    CABACContext * const c = &s->HEVClc->cc;
+++    unsigned int i;
+++    uint32_t ret = 0;
++ 
++     for (i = 0; i < nb; i++)
++-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
++-    return ret;
+++        ret = (ret << 1) | get_cabac_bypass(c);
+ +
+-+mov ra_xshift, ra_xshift_next
+++    return ret << (32 - nb);
+++}
+++#endif
+ +
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+++#ifndef coeff_sign_flag_decode_bypass
+++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint32_t y;
+++    y = get_cabac_by22_peek(c);
+++    get_cabac_by22_flush(c, nb, y);
+++    return y & ~(0xffffffffU >> nb);
+++}
+++#endif
+ +
+-+# set up VPM write
+-+mov vw_setup, rb28
+ +
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+++#ifndef get_cabac_greater1_bits
+++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+++    uint8_t * const state0)
+++{
+++    unsigned int i;
+++    unsigned int rv = 0;
+++    for (i = 0; i != n; ++i) {
+++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+++        const unsigned int b = get_cabac(c, state0 + idx);
+++        rv = (rv << 1) | b;
+++    }
+++    return rv;
+++}
+++#endif
+ +
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, 13
+-+shl r3, r3, 8 # Mask off top 8 bits
+-+shr r3, r3, 8
+ +
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++// N.B. levels returned are the values assuming coeff_abs_level_remaining
+++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
+++// this version of events.
+++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+++    int * const pprev_subset_coded, int * const psum,
+++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
+++{
+++    CABACContext * const c = &s->HEVClc->cc;
+++    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+++    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+++    unsigned int rv;
+++    unsigned int i;
+++    const unsigned int n = FFMIN(n_end, 8);
+ +
+-+# In a B frame, so also set up VPM read
+-+add vr_setup, r3, rb28
+++    // Really this is i != n but the simple unconditional loop is cheaper
+++    // and faster
+++    for (i = 0; i != 8; ++i)
+++        levels[i] = 1;
+ +
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+++    rv = get_cabac_greater1_bits(c, n, state0);
+ +
+-+# get filter coefficients
+++    *pprev_subset_coded = 0;
+++    *psum = n;
+ +
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+++    rv <<= (32 - n);
+++    if (rv != 0)
+++    {
+++        *pprev_subset_coded = 1;
+++        *psum = n + 1;
+++        i = hevc_clz32(rv);
+++        levels[i] = 2;
+++        if (get_cabac(c, state_gt2) == 0)
+++        {
+++            // Unset first coded bit
+++            rv &= ~(0x80000000U >> i);
+++        }
+++    }
+ +
+-+# r2 is elem_num
+-+# r3 is loop counter
+++    if (n_end > 8) {
+++        const unsigned int g8 = n_end - 8;
+++        rv |= ((1 << g8) - 1) << (24 - g8);
+++        for (i = 0; i != g8; ++i) {
+++            levels[i + 8] = 0;
+++        }
+++    }
+ +
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++    return rv;
+++}
+ +
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++// extended_precision_processing_flag must be false given we are
+++// putting the result into a 16-bit array
+++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+++// scale_m is uint8_t
+++//
+++// scale is [40 - 72] << [0..12] based on qp; worst case is (45 << 12)
+++//   or it can be 2 (if we have transquant_bypass)
+++// shift is set to one less than we really want but would normally be
+++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
+++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+++// to achieve it
+++
+++#ifndef trans_scale_sat
+++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+++{
+++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
++ }
+++#endif
+ +
+-+mov r3, 0
+ +
+-+:uvloop_b
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+++#ifndef update_rice
+++static inline void update_rice(uint8_t * const stat_coeff,
+++    const unsigned int last_coeff_abs_level_remaining,
+++    const unsigned int c_rice_param)
+++{
+++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+++    if (x >= 6)
+++        (*stat_coeff)++;
+++    else if (x == 0 && *stat_coeff > 0)
+++        (*stat_coeff)--;
+++}
+++#endif
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+++// n must be > 0 on entry
+++#ifndef get_cabac_sig_coeff_flag_idxs
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * p)
+++{
+++    do {
+++        if (get_cabac(c, state0 + ctx_map[n]))
+++            *p++ = n;
+++    } while (--n != 0);
+++    return p;
+++}
+++#endif
+ +
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++    unsigned int n,
+++    const uint8_t const * ctx_map,
+++    uint8_t * const flag_idx)
+++{
+++    int rv;
+ +
+-+mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+sub r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+ +
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop_b
+-+max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr r0, r0, 15          ; mov r1, ra21
+-+min.setf ra15, r0, rb22
+++    return rv;
+++}
+ +
+-+# apply vertical filter and write to VPM
+++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x1,  x2,  x3,\
+++     x4,  x5,  x6,  x7,\
+++     x8,  x9, x10, x11,\
+++    x12, x13, x14, x15}
+++
+++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x4,  x8, x12,\
+++     x1,  x5,  x9, x13,\
+++     x2,  x6, x10, x14,\
+++     x3,  x7, x11, x15}
+++
+++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++     x0,  x4,  x1,  x8,\
+++     x5,  x2, x12,  x9,\
+++     x6,  x3, x13, x10,\
+++     x7, x14, x11, x15}
+++
+++
+++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+++    uint8_t * const significant_coeff_group_flag,
+++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+++    int * const pPrev_sig)
+++{
+++    while (--i >= 0) {
+++        unsigned int x_cg = scan_x_cg[i];
+++        unsigned int y_cg = scan_y_cg[i];
+++
+++        // For the flag decode we only care about Z/NZ but
+++        // we use the full Right + Down * 2 when calculating
+++        // significant coeff flags so we obtain it here
+++        //.
+++        // The group flag array is one longer than it needs to
+++        // be so we don't need to check for y_cg limits
+++        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+++            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+++
+++        if (i == 0 ||
+++            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+++        {
+++            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+++            *pPrev_sig = prev_sig;
+++            break;
+++        }
+++    }
+ +
+-+nop                     ; mul24 r0, ra14, rb14
+-+sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 15
+-+min r1, r1, rb22
+-+add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+brr.anyn -, r:uvloop_b
+-+max r1, r1, 0
+-+add r1, r1, r0
+-+shr vpm, r1, 1
+++    return i;
+++}
+ +
++ 
++ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                                 int log2_trafo_size, enum ScanType scan_idx,
++                                 int c_idx)
++ {
++-#define GET_COORD(offset, n)                                    \
++-    do {                                                        \
++-        x_c = (x_cg << 2) + scan_x_off[n];                      \
++-        y_c = (y_cg << 2) + scan_y_off[n];                      \
++-    } while (0)
++-    HEVCLocalContext *lc = s->HEVClc;
++-    int transform_skip_flag = 0;
+++    HEVCLocalContext * const lc = s->HEVClc;
+++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
++ 
++     int last_significant_coeff_x, last_significant_coeff_y;
++-    int last_scan_pos;
++-    int n_end;
++     int num_coeff = 0;
++-    int greater1_ctx = 1;
+++    int prev_subset_coded = 0;
++ 
++     int num_last_subset;
++     int x_cg_last_sig, y_cg_last_sig;
++ 
++-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+++    const uint8_t *scan_x_cg, *scan_y_cg;
+++    const xy_off_t * scan_xy_off;
++ 
++     ptrdiff_t stride = s->frame->linesize[c_idx];
++     int hshift = s->ps.sps->hshift[c_idx];
++     int vshift = s->ps.sps->vshift[c_idx];
++     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
++                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+++#ifdef RPI
+++    //***** transform_skip_flag decoded later!
+++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+++#endif
++     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
++     int explicit_rdpcm_flag = 0;
++     int explicit_rdpcm_dir_flag;
++ 
++     int trafo_size = 1 << log2_trafo_size;
++     int i;
++-    int qp,shift,add,scale,scale_m;
+++    int qp,shift,scale;
++     static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
++     const uint8_t *scale_matrix = NULL;
++     uint8_t dc_scale;
++     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
++                                          lc->tu.intra_pred_mode_c;
++ 
+++    int prev_sig = 0;
+++    const int c_idx_nz = (c_idx != 0);
+ +
+-+# DMA out for U
+++    int may_hide_sign;
+ +
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        int n = trafo_size * trafo_size;
+++        if (use_vpu) {
+++            // We support size 4 and size 5.
+++            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+++            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+++            // num_coeffs is indexed by log2_trafo_size-2
+++            if (log2_trafo_size == 4)
+++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+++            else
+++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+++            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+++        } else {
+++            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+++            s->num_coeffs[s->pass0_job][0] += n;
+++        }
+++    }
+++    // We now do the memset after transform_add while we know the data is cached.
+++    #ifdef RPI_PRECLEAR
+++    #else
+++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+++    #endif
+++#else
++     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+++#endif
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+ +
+-+::mc_end
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-new file mode 100644
+-index 0000000..fbebbbe
+---- /dev/null
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -0,0 +1,425 @@
+-+/*
+-+Copyright (c) 2012, Broadcom Europe Ltd
+-+All rights reserved.
++ 
++     // Derive QP for dequant
++     if (!lc->cu.cu_transquant_bypass_flag) {
++-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+++        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
++         static const uint8_t rem6[51 + 4 * 6 + 1] = {
++             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
++             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
++@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++         };
++         int qp_y = lc->qp_y;
++ 
+++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+++
++         if (s->ps.pps->transform_skip_enabled_flag &&
++             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
++-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+++            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+++            if (transform_skip_flag) {
+++                trans_skip_or_bypass = 1;
+++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
+++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
+++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+++                    may_hide_sign = 0;
+++                }
+++            }
++         }
++ 
++         if (c_idx == 0) {
++@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             qp += s->ps.sps->qp_bd_offset;
++         }
++ 
++-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
++-        add      = 1 << (shift-1);
++-        scale    = level_scale[rem6[qp]] << (div6[qp]);
++-        scale_m  = 16; // default when no custom scaling lists.
++-        dc_scale = 16;
+++        // Shift is set to one less than will actually occur as the scale
+++        // and saturate step adds 1 and then shifts right again
+++        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+++        scale = level_scale[rem6[qp]];
+++        if (div6[qp] >= shift) {
+++            scale <<= (div6[qp] - shift);
+++            shift = 0;
+++        } else {
+++            shift -= div6[qp];
+++        }
++ 
++-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
++             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
++-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
++ 
++             matrix_id = 3 * matrix_id + c_idx;
++ 
++             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+++            dc_scale = scale_matrix[0];
++             if (log2_trafo_size >= 4)
++                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
++         }
+++        else
+++        {
+++            static const uint8_t sixteen_scale[64] = {
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16,
+++                16, 16, 16, 16, 16, 16, 16, 16
+++            };
+++            scale_matrix = sixteen_scale;
+++            dc_scale = 16;
+++        }
++     } else {
+++        static const uint8_t unit_scale[64] = {
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++            1, 1, 1, 1, 1, 1, 1, 1,
+++        };
+++        scale_matrix = unit_scale;
++         shift        = 0;
++-        add          = 0;
++-        scale        = 0;
++-        dc_scale     = 0;
+++        scale        = 2;  // We will shift right to kill this
+++        dc_scale     = 1;
+++
+++        may_hide_sign = 0;
++     }
++ 
++     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
++-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+++        trans_skip_or_bypass) {
+++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
++         if (explicit_rdpcm_flag) {
++-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+++            may_hide_sign = 0;
+++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
++         }
++     }
++ 
++-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+++    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
++                                            &last_significant_coeff_x, &last_significant_coeff_y);
++ 
++     if (last_significant_coeff_x > 3) {
++@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++         int last_x_c = last_significant_coeff_x & 3;
++         int last_y_c = last_significant_coeff_y & 3;
++ 
++-        scan_x_off = ff_hevc_diag_scan4x4_x;
++-        scan_y_off = ff_hevc_diag_scan4x4_y;
++         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
++-        if (trafo_size == 4) {
+++
+++        switch (log2_trafo_size) {
+++        case 2:
++             scan_x_cg = scan_1x1;
++             scan_y_cg = scan_1x1;
++-        } else if (trafo_size == 8) {
+++            break;
+++        case 3:
++             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = diag_scan2x2_x;
++             scan_y_cg = diag_scan2x2_y;
++-        } else if (trafo_size == 16) {
+++            break;
+++        case 4:
++             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = ff_hevc_diag_scan4x4_x;
++             scan_y_cg = ff_hevc_diag_scan4x4_y;
++-        } else { // trafo_size == 32
+++            break;
+++        case 5:
+++        default:
++             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++             scan_x_cg = ff_hevc_diag_scan8x8_x;
++             scan_y_cg = ff_hevc_diag_scan8x8_y;
+++            break;
++         }
++         break;
++     }
++     case SCAN_HORIZ:
++         scan_x_cg = horiz_scan2x2_x;
++         scan_y_cg = horiz_scan2x2_y;
++-        scan_x_off = horiz_scan4x4_x;
++-        scan_y_off = horiz_scan4x4_y;
++         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
++         break;
++     default: //SCAN_VERT
++         scan_x_cg = horiz_scan2x2_y;
++         scan_y_cg = horiz_scan2x2_x;
++-        scan_x_off = horiz_scan4x4_y;
++-        scan_y_off = horiz_scan4x4_x;
++         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
++         break;
++     }
++     num_coeff++;
++     num_last_subset = (num_coeff - 1) >> 4;
++ 
++-    for (i = num_last_subset; i >= 0; i--) {
++-        int n, m;
++-        int x_cg, y_cg, x_c, y_c, pos;
++-        int implicit_non_zero_coeff = 0;
++-        int64_t trans_coeff_level;
++-        int prev_sig = 0;
++-        int offset = i << 4;
++-        int rice_init = 0;
+++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++ 
++-        uint8_t significant_coeff_flag_idx[16];
++-        uint8_t nb_significant_coeff_flag = 0;
++-
++-        x_cg = scan_x_cg[i];
++-        y_cg = scan_y_cg[i];
++-
++-        if ((i < num_last_subset) && (i > 0)) {
++-            int ctx_cg = 0;
++-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
++-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++ 
++-            significant_coeff_group_flag[x_cg][y_cg] =
++-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
++-            implicit_non_zero_coeff = 1;
++-        } else {
++-            significant_coeff_group_flag[x_cg][y_cg] =
++-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
++-             (x_cg == 0 && y_cg == 0));
++-        }
+++    i = num_last_subset;
+++    do {
+++        int implicit_non_zero_coeff = 0;
+++        int n_end;
++ 
++-        last_scan_pos = num_coeff - offset - 1;
+++        uint8_t significant_coeff_flag_idx[16];
+++        unsigned int nb_significant_coeff_flag = 0;
++ 
++         if (i == num_last_subset) {
+++            // First time through
+++            int last_scan_pos = num_coeff - (i << 4) - 1;
++             n_end = last_scan_pos - 1;
++             significant_coeff_flag_idx[0] = last_scan_pos;
++             nb_significant_coeff_flag = 1;
++         } else {
++             n_end = 15;
+++            implicit_non_zero_coeff = (i != 0);
++         }
++ 
++-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
++-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
++-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
++-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
++-
++-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
++-            static const uint8_t ctx_idx_map[] = {
++-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
++-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
++-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
++-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
++-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+++        if (n_end >= 0) {
+++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
+++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
+++            };
+++            static const uint8_t ctx_idx_maps[3][4][16] = {
+++                {
+++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                },
+++                {
+++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                },
+++                {
+++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+++                }
++             };
++             const uint8_t *ctx_idx_map_p;
++             int scf_offset = 0;
++-            if (s->ps.sps->transform_skip_context_enabled_flag &&
++-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
++-                if (c_idx == 0) {
++-                    scf_offset = 40;
++-                } else {
++-                    scf_offset = 14 + 27;
++-                }
+++
+++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+++                ctx_idx_map_p = ctx_idx_maps[0][3];
+++                scf_offset = 40 + c_idx_nz;
++             } else {
++-                if (c_idx != 0)
+++                if (c_idx_nz != 0)
++                     scf_offset = 27;
+++
++                 if (log2_trafo_size == 2) {
++-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
++                 } else {
++-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
++-                    if (c_idx == 0) {
++-                        if ((x_cg > 0 || y_cg > 0))
+++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+++                    if (!c_idx_nz) {
+++                        if (i != 0)
++                             scf_offset += 3;
+++
++                         if (log2_trafo_size == 3) {
++                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
++                         } else {
++@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                     }
++                 }
++             }
++-            for (n = n_end; n > 0; n--) {
++-                x_c = scan_x_off[n];
++-                y_c = scan_y_off[n];
++-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
++-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
++-                    nb_significant_coeff_flag++;
+++
+++            if (n_end > 0) {
+++                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+++                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+++                    n_end, ctx_idx_map_p,
+++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
+++
+++                nb_significant_coeff_flag += cnt;
+++                if (cnt != 0) {
++                     implicit_non_zero_coeff = 0;
++                 }
++             }
+ +
+-+Redistribution and use in source and binary forms, with or without
+-+modification, are permitted provided that the following conditions are met:
+-+    * Redistributions of source code must retain the above copyright
+-+      notice, this list of conditions and the following disclaimer.
+-+    * Redistributions in binary form must reproduce the above copyright
+-+      notice, this list of conditions and the following disclaimer in the
+-+      documentation and/or other materials provided with the distribution.
+-+    * Neither the name of the copyright holder nor the
+-+      names of its contributors may be used to endorse or promote products
+-+      derived from this software without specific prior written permission.
++             if (implicit_non_zero_coeff == 0) {
++-                if (s->ps.sps->transform_skip_context_enabled_flag &&
++-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
++-                    if (c_idx == 0) {
++-                        scf_offset = 42;
++-                    } else {
++-                        scf_offset = 16 + 27;
++-                    }
+++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+++                    scf_offset = 42 + c_idx_nz;
++                 } else {
++                     if (i == 0) {
++-                        if (c_idx == 0)
++-                            scf_offset = 0;
++-                        else
++-                            scf_offset = 27;
+++                        scf_offset = c_idx_nz ? 27 : 0;
++                     } else {
++                         scf_offset = 2 + scf_offset;
++                     }
++                 }
++-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+++                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
++                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++                     nb_significant_coeff_flag++;
++                 }
++@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             }
++         }
++ 
++-        n_end = nb_significant_coeff_flag;
++-
+++        if (nb_significant_coeff_flag != 0) {
+++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+++                prev_subset_coded;
+++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+++                (gt1_idx_delta << 2);
+++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+++                gt1_idx_delta;
+++
+++            const unsigned int x_cg = scan_x_cg[i];
+++            const unsigned int y_cg = scan_y_cg[i];
+++            int16_t * const blk_coeffs = coeffs +
+++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
+++            // This calculation is 'wrong' for log2_trafo_size == 2
+++            // but that doesn't matter as in this case x_cg & y_cg
+++            // are always 0 so result is correct (0) anyway
+++            const uint8_t * const blk_scale = scale_matrix +
+++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+++
+++            // * The following code block doesn't deal with these flags:
+++            //   (nor did the one it replaces)
+++            //
+++            // cabac_bypass_alignment_enabled_flag
+++            //    This should be easy but I can't find a test case
+++            // extended_precision_processing_flag
+++            //    This can extend the required precision past 16bits
+++            //    so is probably tricky - also no example found yet
+++
+++#if USE_N_END_1
+++            if (nb_significant_coeff_flag == 1) {
+++                // There is a small gain to be had from special casing the single
+++                // transform coefficient case.  The reduction in complexity
+++                // makes up for the code duplication.
+++
+++                int trans_coeff_level = 1;
+++                int coeff_sign_flag;
+++                int coded_val = 0;
+++
+++                // initialize first elem of coeff_abs_level_greater1_flag
+++                prev_subset_coded = 0;
+++
+++                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+++                    trans_coeff_level = 2;
+++                    prev_subset_coded = 1;
+++                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+++                }
++ 
++-        if (n_end) {
++-            int first_nz_pos_in_cg;
++-            int last_nz_pos_in_cg;
++-            int c_rice_param = 0;
++-            int first_greater1_coeff_idx = -1;
++-            uint8_t coeff_abs_level_greater1_flag[8];
++-            uint16_t coeff_sign_flag;
++-            int sum_abs = 0;
++-            int sign_hidden;
++-            int sb_type;
+++                // Probably not worth the overhead of starting by22 for just one value
+++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
++ 
+++                if (coded_val)
+++                {
+++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+++                    } else {
+++                        uint8_t * const stat_coeff =
+++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+++                        const unsigned int c_rice_param = *stat_coeff >> 2;
+++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++ 
++-            // initialize first elem of coeff_bas_level_greater1_flag
++-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                    }
+++                }
++ 
++-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
++-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
++-                else
++-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
++-                c_rice_param = lc->stat_coeff[sb_type] / 4;
++-            }
+++                {
+++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++ 
++-            if (!(i == num_last_subset) && greater1_ctx == 0)
++-                ctx_set++;
++-            greater1_ctx = 1;
++-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
++-
++-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
++-                int inc = (ctx_set << 2) + greater1_ctx;
++-                coeff_abs_level_greater1_flag[m] =
++-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
++-                if (coeff_abs_level_greater1_flag[m]) {
++-                    greater1_ctx = 0;
++-                    if (first_greater1_coeff_idx == -1)
++-                        first_greater1_coeff_idx = m;
++-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
++-                    greater1_ctx++;
+++                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
+++                        (trans_coeff_level ^ k) - k,  // Apply sign
+++                        scale,
+++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+++                        shift);
++                 }
++             }
++-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
++-
++-            if (lc->cu.cu_transquant_bypass_flag ||
++-                (lc->cu.pred_mode ==  MODE_INTRA  &&
++-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
++-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
++-                 explicit_rdpcm_flag)
++-                sign_hidden = 0;
++             else
++-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+++#endif
+++            {
+++                int sign_hidden = may_hide_sign;
+++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
+++                uint32_t coeff_sign_flags;
+++                uint32_t coded_vals = 0;
+++                // Sum(abs(level[]))
+++                // In fact we only need the bottom bit and in some future
+++                // version that may be all we calculate
+++                unsigned int sum_abs;
+++
+++                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
+++
+++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+++                    sign_hidden = 0;
+++
+++                // -- Start bypass block
+++
+++                bypass_start(s);
+++
+++                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+++
+++                if (coded_vals != 0)
+++                {
+++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+++                    int * level = levels - 1;
+++
+++                    do {
+++                        {
+++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
+++                            level += z;
+++                            coded_vals <<= z;
+++                        }
++ 
++-            if (first_greater1_coeff_idx != -1) {
++-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
++-            }
++-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
++-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
++-            } else {
++-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
++-            }
+++                        {
+++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
+++
+++                            sum_abs += last_coeff_abs_level_remaining + 1;
+++                            *level = trans_coeff_level;
+++
+++                            if (stat_coeff != NULL)
+++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                            stat_coeff = NULL;
++ 
++-            for (m = 0; m < n_end; m++) {
++-                n = significant_coeff_flag_idx[m];
++-                GET_COORD(offset, n);
++-                if (m < 8) {
++-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
++-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
++-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++-
++-                        trans_coeff_level += last_coeff_abs_level_remaining;
++-                        if (trans_coeff_level > (3 << c_rice_param))
++-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
++-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
++-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
++-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
++-                                lc->stat_coeff[sb_type]++;
++-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
++-                                if (lc->stat_coeff[sb_type] > 0)
++-                                    lc->stat_coeff[sb_type]--;
++-                            rice_init = 1;
+++                            if (trans_coeff_level > (3 << c_rice_param) &&
+++                                (c_rice_param < 4 || rice_adaptation_enabled))
+++                                ++c_rice_param;
++                         }
++-                    }
++-                } else {
++-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
++-
++-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
++-                    if (trans_coeff_level > (3 << c_rice_param))
++-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
++-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
++-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
++-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
++-                            lc->stat_coeff[sb_type]++;
++-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
++-                            if (lc->stat_coeff[sb_type] > 0)
++-                                lc->stat_coeff[sb_type]--;
++-                        rice_init = 1;
++-                    }
+++                    } while (coded_vals != 0);
++                 }
++-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
++-                    sum_abs += trans_coeff_level;
++-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
++-                        trans_coeff_level = -trans_coeff_level;
+++
+++                // sign_hidden = 0 or 1 so we can combine the tests
+++                if ((sign_hidden & sum_abs) != 0) {
+++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
++                 }
++-                if (coeff_sign_flag >> 15)
++-                    trans_coeff_level = -trans_coeff_level;
++-                coeff_sign_flag <<= 1;
++-                if(!lc->cu.cu_transquant_bypass_flag) {
++-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
++-                        if(y_c || x_c || log2_trafo_size < 4) {
++-                            switch(log2_trafo_size) {
++-                                case 3: pos = (y_c << 3) + x_c; break;
++-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
++-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
++-                                default: pos = (y_c << 2) + x_c; break;
++-                            }
++-                            scale_m = scale_matrix[pos];
++-                        } else {
++-                            scale_m = dc_scale;
++-                        }
+++
+++                bypass_finish(s);
+++
+++                // -- Finish bypass block
+++
+++                // Scale loop
+++                {
+++                    int m = nb_significant_coeff_flag - 1;
+++
+++                    // Deal with DC component (if any) first
+++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
+++                    {
+++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+++                        blk_coeffs[0] = trans_scale_sat(
+++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
+++                        --m;
++                     }
++-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
++-                    if(trans_coeff_level < 0) {
++-                        if((~trans_coeff_level) & 0xFffffffffff8000)
++-                            trans_coeff_level = -32768;
++-                    } else {
++-                        if(trans_coeff_level & 0xffffffffffff8000)
++-                            trans_coeff_level = 32767;
+++
+++#if !USE_N_END_1
+++                    // If N_END_1 set then m was at least 1 initially
+++                    if (m >= 0)
+++#endif
+++                    {
+++                        do {
+++                            const xy_off_t * const xy_off = scan_xy_off +
+++                                significant_coeff_flag_idx[m];
+++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+++
+++                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
+++                                (levels[m] ^ k) - k,
+++                                scale,
+++                                blk_scale[xy_off->scale],
+++                                shift);
+++                        } while (--m >= 0);
++                     }
++                 }
++-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+ +
+-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-+*/
++             }
++         }
++-    }
+++    } while ((i = next_subset(s, i, c_idx_nz,
+++        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
++ 
++     if (lc->cu.cu_transquant_bypass_flag) {
++         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++         }
++     } else {
++-        if (transform_skip_flag) {
+++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
++             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
++                       log2_trafo_size == 2 &&
++                       lc->cu.pred_mode == MODE_INTRA;
++@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                 for (i = 0; i < 8; i++)
++                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
++             }
++-
++             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
++ 
++             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++             }
++         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
++-            s->hevcdsp.idct_4x4_luma(coeffs);
+++           s->hevcdsp.idct_4x4_luma(coeffs);
++         } else {
+++#ifdef RPI
+++            if (!use_vpu) {
+++              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+++              if (max_xy == 0) {
+++                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+++              } else {
+++                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+++                  if (max_xy < 4)
+++                      col_limit = FFMIN(4, col_limit);
+++                  else if (max_xy < 8)
+++                      col_limit = FFMIN(8, col_limit);
+++                  else if (max_xy < 12)
+++                      col_limit = FFMIN(24, col_limit);
+ +
+-+#ifndef __USER_VCSM__H__INCLUDED__
+-+#define __USER_VCSM__H__INCLUDED__
+++                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+++              }
+++            }
+++#else
++             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++             if (max_xy == 0)
++                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
++@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++                     col_limit = FFMIN(24, col_limit);
++                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
++             }
+++#endif
++         }
++     }
++     if (lc->tu.cross_pf) {
++@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++         }
++     }
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++        cmd->type = RPI_PRED_TRANSFORM_ADD;
+++        cmd->size = log2_trafo_size;
+++        cmd->buf = coeffs;
+++        cmd->dst = dst;
+++        cmd->stride = stride;
+++        return;
+++    }
+++#endif
++     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
++ }
++ 
++diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
++index 1f33b0c..55a0315 100644
++--- a/libavcodec/hevc_filter.c
+++++ b/libavcodec/hevc_filter.c
++@@ -22,6 +22,12 @@
++  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
+++//#define DISABLE_SAO
+++//#define DISABLE_DEBLOCK
+++//#define DISABLE_STRENGTHS
+++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+++//#define DISABLE_DEBLOCK_NONREF
+ +
+-+/* VideoCore Shared Memory - user interface library.
+-+**
+-+** This library provides all the necessary abstraction for any application to
+-+** make use of the shared memory service which is distributed accross a kernel
+-+** driver and a videocore service.
+-+**
+-+** It is an application design decision to choose or not to use this service.
+-+**
+-+** The logical flow of operations that a user application needs to follow when
+-+** using this service is:
+-+**
+-+**       1) Initialize the service.
+-+**       2) Allocate shared memory blocks.
+-+**       3) Start using the allocated blocks.
+-+**          - In order to gain ownership on a block, lock the allocated block,
+-+**            locking a block returns a valid address that the user application
+-+**            can access.
+-+**          - When finished with using the block for the current execution cycle
+-+**            or function, and so when giving up the ownership, unlock the block.
+-+**       4) A block can be locked/unlocked as many times required - within or outside
+-+**          of - a specific execution context.
+-+**       5) To completely release an allocated block, free it.
+-+**       6) If the service is no longer required, terminate it.
+-+**
+-+**
+-+** Some generic considerations:
++ #include "libavutil/common.h"
++ #include "libavutil/internal.h"
++ 
++@@ -31,6 +37,11 @@
++ 
++ #include "bit_depth_template.c"
++ 
+++#ifdef RPI
+++#include "rpi_user_vcsm.h"
+++#include "rpi_qpu.h"
+++#endif
+ +
+-+** Allocating memory blocks.
+-+**
+-+**   Memory blocks can be allocated in different manners depending on the cache
+-+**   behavior desired.  A given block can either be:
++ #define LUMA 0
++ #define CB 1
++ #define CR 2
++@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
++     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
++ 
+++#ifdef DISABLE_SAO
+++    return;
+++#endif
+ +
+-+**       - Allocated in a non cached fashion all the way through host and videocore.
+-+**       - Allocated in a cached fashion on host OR videocore.
+-+**       - Allocated in a cached fashion on host AND videocore.
+-+**
+-+**   It is an application decision to determine how to allocate a block.  Evidently
+-+**   if the application will be doing substantial read/write accesses to a given block,
+-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
+-+**   better results.
+-+**
+-+**
+-+** Locking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, locking the
+-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**   It is possible to dynamically change the host cache behavior (ie cached or non
+-+**   cached) of a given allocation without needing to free and re-allocate the block.
+-+**   This feature can be useful for such application which requires access to the block
+-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+-+**   the application can optimize performances for a given duration of use.
+-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+-+**   cache.  If one requires to change the videocore cache behavior, then a new block
+-+**   must be created to replace the old one.
+-+**
+-+**   On successful locking, a valid pointer is returned that the application can use
+-+**   to access to data inside the block.  There is no guarantee that the pointer will
+-+**   stay valid following the unlock action corresponding to this lock.
+-+**
+-+**
+-+** Unocking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, unlocking the
+-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
+-+**   explicitely asked not to flush the cache for performances reasons.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**
+-+** A complete API is defined below.
+-+*/
++     if (restore) {
++         if (!edges[0]) {
++             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
++@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                 s->ps.sps->pcm.loop_filter_disable_flag) ||
++                s->ps.pps->transquant_bypass_enable_flag;
++ 
+++#ifdef DISABLE_DEBLOCK_NONREF
+++    if (!s->used_for_ref)
+++      return; // Don't deblock non-reference frames
+++#endif
+++#ifdef DISABLE_DEBLOCK
+++    return;
+++#endif
+++    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+++        return;
++     if (x0) {
++         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
++         left_beta_offset = s->deblock[ctb - 1].beta_offset;
++@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                          s->frame->linesize[LUMA],
++                                                          beta, tc, no_p, no_q);
++                 } else
+++#ifdef RPI_DEBLOCK_VPU
+++                if (s->enable_rpi_deblock) {
+++                    uint8_t (*setup)[2][2][4];
+++                    int num16 = (y>>4)*s->setup_width + (x>>4);
+++                    int a = ((y>>3) & 1) << 1;
+++                    int b = (x>>3) & 1;
+++                    setup = s->dvq->y_setup_arm[num16];
+++                    setup[0][b][0][a] = beta;
+++                    setup[0][b][0][a + 1] = beta;
+++                    setup[0][b][1][a] = tc[0];
+++                    setup[0][b][1][a + 1] = tc[1];
+++                } else
+++#endif
++                     s->hevcdsp.hevc_v_loop_filter_luma(src,
++                                                        s->frame->linesize[LUMA],
++                                                        beta, tc, no_p, no_q);
++@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                          s->frame->linesize[LUMA],
++                                                          beta, tc, no_p, no_q);
++                 } else
+++#ifdef RPI_DEBLOCK_VPU
+++                if (s->enable_rpi_deblock) {
+++                    uint8_t (*setup)[2][2][4];
+++                    int num16 = (y>>4)*s->setup_width + (x>>4);
+++                    int a = ((x>>3) & 1) << 1;
+++                    int b = (y>>3) & 1;
+++                    setup = s->dvq->y_setup_arm[num16];
+++                    setup[1][b][0][a] = beta;
+++                    setup[1][b][0][a + 1] = beta;
+++                    setup[1][b][1][a] = tc[0];
+++                    setup[1][b][1][a + 1] = tc[1];
+++                } else
+++#endif
++                     s->hevcdsp.hevc_h_loop_filter_luma(src,
++                                                        s->frame->linesize[LUMA],
++                                                        beta, tc, no_p, no_q);
++@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                                    s->frame->linesize[chroma],
++                                                                    c_tc, no_p, no_q);
++                         } else
+++#ifdef RPI_DEBLOCK_VPU
+++                        if (s->enable_rpi_deblock) {
+++                            uint8_t (*setup)[2][2][4];
+++                            int xc = x>>s->ps.sps->hshift[chroma];
+++                            int yc = y>>s->ps.sps->vshift[chroma];
+++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                            int a = ((yc>>3) & 1) << 1;
+++                            int b = (xc>>3) & 1;
+++                            setup = s->dvq->uv_setup_arm[num16];
+++                            setup[0][b][0][a] = c_tc[0];
+++                            setup[0][b][0][a + 1] = c_tc[1];
+++                        } else
+++#endif
++                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
++                                                                  s->frame->linesize[chroma],
++                                                                  c_tc, no_p, no_q);
+ +
+-+#ifdef __cplusplus
+-+extern "C"
+-+{
++                     }
++                 }
++ 
++@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                                    s->frame->linesize[chroma],
++                                                                    c_tc, no_p, no_q);
++                         } else
+++#ifdef RPI_DEBLOCK_VPU
+++                        if (s->enable_rpi_deblock) {
+++                            uint8_t (*setup)[2][2][4];
+++                            int xc = x>>s->ps.sps->hshift[chroma];
+++                            int yc = y>>s->ps.sps->vshift[chroma];
+++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                            int a = ((xc>>3) & 1) << 1;
+++                            int b = (yc>>3) & 1;
+++                            setup = s->dvq->uv_setup_arm[num16];
+++                            setup[1][b][0][a] = c_tc[0];
+++                            setup[1][b][0][a + 1] = c_tc[1];
+++                        } else
+ +#endif
++                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
++                                                                  s->frame->linesize[chroma],
++                                                                  c_tc, no_p, no_q);
++@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     }
++ }
++ 
++-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
++-                             RefPicList *neigh_refPicList)
++-{
++-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
++-        // same L0 and L1
++-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
++-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
++-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
++-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else {
++-            return 1;
++-        }
++-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++-        Mv A, B;
++-        int ref_A, ref_B;
++-
++-        if (curr->pred_flag & 1) {
++-            A     = curr->mv[0];
++-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
++-        } else {
++-            A     = curr->mv[1];
++-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
++-        }
++-
++-        if (neigh->pred_flag & 1) {
++-            B     = neigh->mv[0];
++-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
++-        } else {
++-            B     = neigh->mv[1];
++-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
++-        }
++-
++-        if (ref_A == ref_B) {
++-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else
++-            return 1;
++-    }
++-
++-    return 1;
++-}
++ 
++ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++                                            int log2_trafo_size)
++@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
++     int min_pu_width     = s->ps.sps->min_pu_width;
++     int min_tu_width     = s->ps.sps->min_tb_width;
++-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
++-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
++     int boundary_upper, boundary_left;
++-    int i, j, bs;
+++    int i, j;
+++    RefPicList *rpl      = s->ref->refPicList;
+++    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+++    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+++    int y_pu             = y0 >> log2_min_pu_size;
+++    int x_pu             = x0 >> log2_min_pu_size;
+++    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    int is_intra         = curr->pred_flag == PF_INTRA;
+++    int inc              = log2_min_pu_size == 2 ? 2 : 1;
+++    uint8_t *bs;
+ +
+-+/* Different status that can be dumped.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+-+                                    // Result of the walk is seen in the videocore
+-+                                    // log.
+-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+-+                                    // driver (ie for all processes).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+-+                                    // VCSM_STATUS_HOST_WALK_MAP.
+-+                                    //
+-+   VCSM_STATUS_NONE,                // Must be last - invalid.
+++#ifdef DISABLE_STRENGTHS
+++    return;
+++#endif
++ 
++     boundary_upper = y0 > 0 && !(y0 & 7);
++     if (boundary_upper &&
++@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_upper = 0;
++ 
+++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+ +
+-+} VCSM_STATUS_T;
++     if (boundary_upper) {
++         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
++                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
++-                              s->ref->refPicList;
++-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
++-        int yq_pu =  y0      >> log2_min_pu_size;
++-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
++-        int yq_tu =  y0      >> log2_min_tu_size;
+++                              rpl;
+++        MvField *top = curr - min_pu_width;
+ +
+-+/* Different kind of cache behavior.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+++        if (is_intra) {
+++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+++                bs[i >> 2] = 2;
+ +
+-+} VCSM_CACHE_TYPE_T;
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+ +
+-+/* Initialize the vcsm processing.
+-+**
+-+** Must be called once before attempting to do anything else.
+-+**
+-+** Returns 0 on success, -1 on error.
+-+*/
+-+int vcsm_init( void );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+++                    curr, top, bs);
++ 
++             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int x_pu = (x0 + i) >> log2_min_pu_size;
++-                int x_tu = (x0 + i) >> log2_min_tu_size;
++-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
++-                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
++-
++-                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
++-                    bs = 2;
++-                else if (curr_cbf_luma || top_cbf_luma)
++-                    bs = 1;
++-                else
++-                    bs = boundary_strength(s, curr, top, rpl_top);
++-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+++                int i_pu = i >> log2_min_pu_size;
+++                int i_tu = i >> log2_min_tu_size;
+ +
+++                if (top[i_pu].pred_flag == PF_INTRA)
+++                    bs[i >> 2] = 2;
+++                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+++                    bs[i >> 2] = 1;
++             }
+++        }
+++    }
+ +
+-+/* Terminates the vcsm processing.
+-+**
+-+** Must be called vcsm services are no longer needed, it will
+-+** take care of removing any allocation under the current process
+-+** control if deemed necessary.
+-+*/
+-+void vcsm_exit( void );
+++    if (!is_intra) {
+++        for (j = inc; j < trafo_in_min_pus; j += inc) {
+++            MvField *top;
+ +
+++            curr += min_pu_width * inc;
+++            top = curr - min_pu_width;
+++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+ +
+-+/* Queries the status of the the vcsm.
+-+**
+-+** Triggers dump of various kind of information, see the
+-+** different variants specified in VCSM_STATUS_T.
+-+**
+-+** Pid is optional.
+-+*/
+-+void vcsm_status( VCSM_STATUS_T status, int pid );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                    curr, top, bs);
+++        }
++     }
++ 
++-    // bs for vertical TU boundaries
++     boundary_left = x0 > 0 && !(x0 & 7);
++     if (boundary_left &&
++         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
++@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_left = 0;
++ 
+++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+ +
++     if (boundary_left) {
++         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
++                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
++-                               s->ref->refPicList;
++-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
++-        int xq_pu =  x0      >> log2_min_pu_size;
++-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
++-        int xq_tu =  x0      >> log2_min_tu_size;
+++                               rpl;
+++        MvField *left = curr - 1;
++ 
++-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int y_pu      = (y0 + i) >> log2_min_pu_size;
++-                int y_tu      = (y0 + i) >> log2_min_tu_size;
++-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
++-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
++-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
++-
++-                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
++-                    bs = 2;
++-                else if (curr_cbf_luma || left_cbf_luma)
++-                    bs = 1;
++-                else
++-                    bs = boundary_strength(s, curr, left, rpl_left);
++-                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
++-            }
++-    }
+++        if (is_intra) {
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+++                bs[j * s->bs_width >> 2] = 2;
++ 
++-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
++-        RefPicList *rpl = s->ref->refPicList;
++-
++-        // bs for TU internal horizontal PU boundaries
++-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
++-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
++-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
++-
++-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int x_pu = (x0 + i) >> log2_min_pu_size;
++-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-
++-                bs = boundary_strength(s, curr, top, rpl);
++-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+ +
+-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+-+** allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc( unsigned int size, char *name );
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+++                    curr, left, bs);
+ +
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+++                int j_pu = j >> log2_min_pu_size;
+++                int j_tu = j >> log2_min_tu_size;
+ +
+-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+-+** allocator, the type of caching requested is passed as argument of the
+-+** function call.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+++                    bs[j * s->bs_width >> 2] = 2;
+++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+++                    bs[j * s->bs_width >> 2] = 1;
++             }
++         }
+++    }
++ 
++-        // bs for TU internal vertical PU boundaries
++-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
++-            int y_pu = (y0 + j) >> log2_min_pu_size;
+++    if (!is_intra) {
+++        for (i = inc; i < trafo_in_min_pus; i += inc) {
+++            MvField *left;
++ 
++-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
++-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
++-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
++-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+++            curr += inc;
+++            left = curr - 1;
+++            bs += inc << log2_min_pu_size >> 2;
++ 
++-                bs = boundary_strength(s, curr, left, rpl);
++-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
++-            }
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                    curr, left, bs);
++         }
++     }
++ }
++@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++ #undef CB
++ #undef CR
++ 
+++#if !defined(RPI_FAST_CACHEFLUSH)
+++#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+++static void flush_buffer_y(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+++    gpu_cache_flush(&p);
+++}
+ +
+++static void flush_buffer_u(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+++    gpu_cache_flush(&p);
+++}
+ +
+-+/* Shares an allocated block of memory via the vcsm memory allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_share( unsigned int handle );
+-+
+-+
+-+/* Resizes a block of memory allocated previously by vcsm_alloc.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** The handle must be unlocked by user prior to attempting any
+-+** resize action.
+-+**
+-+** On error, the original size allocated against the handle
+-+** remains available the same way it would be following a
+-+** successful vcsm_malloc.
+-+*/
+-+int vcsm_resize( unsigned int handle, unsigned int new_size );
+-+
+-+
+-+/* Frees a block of memory that was successfully allocated by
+-+** a prior call the vcms_alloc.
+-+**
+-+** The handle should be considered invalid upon return from this
+-+** call.
+-+**
+-+** Whether any memory is actually freed up or not as the result of
+-+** this call will depends on many factors, if all goes well it will
+-+** be freed.  If something goes wrong, the memory will likely end up
+-+** being freed up as part of the vcsm_exit process.  In the end the
+-+** memory is guaranteed to be freed one way or another.
+-+*/
+-+void vcsm_free( unsigned int handle );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a mapped user address
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+++static void flush_buffer_v(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+++    gpu_cache_flush(&p);
+++}
+++#endif
+++#endif
+ +
+ +
+-+/* Retrieves a videocore opaque handle from a opaque handle
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+++#ifdef RPI_DEBLOCK_VPU
+++#error Not fixed yet
+ +
+++// ff_hevc_flush_buffer_lines
+++// flushes and invalidates all pixel rows in [start,end-1]
+++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++        struct vcsm_user_clean_invalid_s iocache = {};
+++        int curr_y = start;
+++        int n = end;
+++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+++        int n_uv = n >> s->ps.sps->vshift[1];
+++        int sz,base;
+++        GPU_MEM_PTR_T p;
+++        if (curr_uv < 0) curr_uv = 0;
+++        if (n_uv<=curr_uv) { return; }
+++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++        base = s->frame->linesize[1] * curr_uv;
+++        if (flush_chroma) {
+++          p = get_gpu_mem_ptr_u(s->frame);
+++          iocache.s[0].handle = p.vcsm_handle;
+++          iocache.s[0].cmd = 3; // clean+invalidate
+++          iocache.s[0].addr = (int)p.arm + base;
+++          iocache.s[0].size  = sz;
+++          p = get_gpu_mem_ptr_v(s->frame);
+++          iocache.s[1].handle = p.vcsm_handle;
+++          iocache.s[1].cmd = 3; // clean+invalidate
+++          iocache.s[1].addr = (int)p.arm + base;
+++          iocache.s[1].size  = sz;
+++        }
+++        if (flush_luma) {
+++          p = get_gpu_mem_ptr_y(s->frame);
+++          sz = s->frame->linesize[0] * (n-curr_y);
+++          base = s->frame->linesize[0] * curr_y;
+++          iocache.s[2].handle = p.vcsm_handle;
+++          iocache.s[2].cmd = 3; // clean+invalidate
+++          iocache.s[2].addr = (int)p.arm + base;
+++          iocache.s[2].size  = sz;
+++        }
+++        vcsm_clean_invalid( &iocache );
+++#else
+++        if (flush_chroma) {
+++          flush_buffer_u(s->frame);
+++          flush_buffer_v(s->frame);
+++        }
+++        if (flush_luma) {
+++          flush_buffer_y(s->frame);
+++        }
+++#endif
+++}
+++#endif
+ +
+-+/* Retrieves a user opaque handle from a mapped user address
+-+** pointer.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+*/
+-+unsigned int vcsm_usr_handle( void *usr_ptr );
+++#ifdef RPI_INTER_QPU
+++void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+++{
+++    if (s->enable_rpi && s->used_for_ref) {
+++      // TODO make this use ff_hevc_flush_buffer_lines
+++#ifdef RPI_FAST_CACHEFLUSH
+++        struct vcsm_user_clean_invalid_s iocache = {};
+++        int curr_y = ((int *)f->progress->data)[0];
+++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+++        int n_uv = n >> s->ps.sps->vshift[1];
+++        int sz,base;
+++        GPU_MEM_PTR_T p;
+++        if (curr_uv < 0) curr_uv = 0;
+++        if (n_uv<=curr_uv) { return; }
+++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+++        base = s->frame->linesize[1] * curr_uv;
+++        p = get_gpu_mem_ptr_u(s->frame);
+++        iocache.s[0].handle = p.vcsm_handle;
+++        iocache.s[0].cmd = 3; // clean+invalidate
+++        iocache.s[0].addr = (int)p.arm + base;
+++        iocache.s[0].size  = sz;
+++        p = get_gpu_mem_ptr_v(s->frame);
+++        iocache.s[1].handle = p.vcsm_handle;
+++        iocache.s[1].cmd = 3; // clean+invalidate
+++        iocache.s[1].addr = (int)p.arm + base;
+++        iocache.s[1].size  = sz;
+ +
+++#ifdef RPI_LUMA_QPU
+++        p = get_gpu_mem_ptr_y(s->frame);
+++        sz = s->frame->linesize[0] * (n-curr_y);
+++        base = s->frame->linesize[0] * curr_y;
+++        iocache.s[2].handle = p.vcsm_handle;
+++        iocache.s[2].cmd = 3; // clean+invalidate
+++        iocache.s[2].addr = (int)p.arm + base;
+++        iocache.s[2].size  = sz;
+++#endif
+++        vcsm_clean_invalid( &iocache );
+++#else
+++        flush_buffer_u(s->frame);
+++        flush_buffer_v(s->frame);
+++#ifdef RPI_LUMA_QPU
+++        flush_buffer_y(s->frame);
+++#endif
+ +
+-+/* Retrieves a mapped user address from an opaque user
+-+** handle.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero address on success.
+-+**
+-+** On success, the address corresponds to the pointer
+-+** which can access the data allocated via the vcsm_malloc
+-+** call.
+-+*/
+-+void *vcsm_usr_address( unsigned int handle );
+++#endif
+++        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+++        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+++        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+++    }
+++}
+++#endif
+ +
+++#ifdef RPI_DEBLOCK_VPU
+++#error XXX
+++/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+++static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+++{
+++  // Flush image, 4 lines above to bottom of ctb stripe
+++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+++  // TODO flush buffer of beta/tc setup when it becomes cached
+ +
+-+/* Locks the memory associated with this opaque handle.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock( unsigned int handle );
+++  // Prepare three commands at once to avoid calling overhead
+++  s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+++  s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+++  s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+++  s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+++  s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+++  s->dvq->vpu_cmds_arm[0][5] = 2;
+++
+++  s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+++  s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+++  s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+++  s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++  s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++  s->dvq->vpu_cmds_arm[1][5] = 3;
+++
+++  s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+++  s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+++  s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+++  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++  s->dvq->vpu_cmds_arm[2][5] = 4;
+++  // Call VPU
+++  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+ +
+++  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+++  s->dvq = s->dvq_ents + s->dvq_n;
+ +
+-+/* Locks the memory associated with this opaque handle.  The lock
+-+** also gives a chance to update the *host* cache behavior of the
+-+** allocated buffer if so desired.  The *videocore* cache behavior
+-+** of the allocated buffer cannot be changed by this call and such
+-+** attempt will be ignored.
+-+**
+-+** The system will attempt to honour the cache_update mode request,
+-+** the cache_result mode will provide the final answer on which cache
+-+** mode is really in use.  Failing to change the cache mode will not
+-+** result in a failure to lock the buffer as it is an application
+-+** decision to choose what to do if (cache_result != cache_update)
+-+**
+-+** The value returned in cache_result can only be considered valid if
+-+** the returned pointer is non NULL.  The cache_result pointer may be
+-+** NULL if the application does not care about the actual outcome of
+-+** its action with regards to the cache behavior change.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock_cache( unsigned int handle,
+-+                       VCSM_CACHE_TYPE_T cache_update,
+-+                       VCSM_CACHE_TYPE_T *cache_result );
+++  if (s->dvq->cmd_id != -1) {
+++      vpu_wait(s->dvq->cmd_id);
+++      s->dvq->cmd_id = -1;
+++  }
+++}
+ +
+++#endif
+ +
+-+/* Unlocks the memory associated with this user mapped address.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr( void *usr_ptr );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl( unsigned int handle );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+-+
+-+#ifdef __cplusplus
+-+}
++ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++ {
++     int x_end = x >= s->ps.sps->width  - ctb_size;
+++#ifdef RPI_DEBLOCK_VPU
+++    int done_deblock = 0;
+ +#endif
+-+
+-+#endif /* __USER_VCSM__H__INCLUDED__ */
+--- 
+-2.7.4
+-
+-
+-From 6cfa5910be47865aaaf58c185587189c332765a6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@argondesign.com>
+-Date: Sat, 2 May 2015 21:15:37 +0100
+-Subject: [PATCH 04/68] First working version with uncached memory
+-
+----
+- libavcodec/hevc.c               |  61 +++++-
+- libavcodec/hevc.h               |  12 +-
+- libavcodec/hevc_cabac.c         |  39 +++-
+- libavcodec/hevc_filter.c        |  16 ++
+- libavcodec/hevcpred_template.c  |   6 +
+- libavcodec/rpi_hevc_transform.h | 422 +++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s | 153 +++++++++++++--
+- libavcodec/rpi_qpu.c            |  72 +++++++
+- libavcodec/rpi_qpu.h            |   1 +
+- 9 files changed, 736 insertions(+), 46 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ab55df1..94ff709 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -45,6 +45,8 @@
+- #include "rpi_qpu.h"
+- #endif
+- 
+-+// #define DISABLE_MC
+-+
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- /**
+-@@ -1079,11 +1081,15 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+-+                        printf("Cross component not supported\n"); // TODO
+-+                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+- 
+-             if (lc->tu.cross_pf) {
+-+                printf("Cross component not supported\n"); // TODO
+-+                exit(-1);
+-                 hls_cross_component_pred(s, 1);
+-             }
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-@@ -1112,6 +1118,8 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+-+                        printf("Cross component not supported\n"); // TODO
+-+                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+-@@ -1409,6 +1417,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-     int idx              = ff_hevc_pel_weight[block_w];
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
++         deblocking_filter_CTB(s, x, y);
+++#ifdef RPI_DEBLOCK_VPU
+++    if (s->enable_rpi_deblock && x_end)
+++    {
+++      int y_at_end = y >= s->ps.sps->height - ctb_size;
+++      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
+++      int y_start = y&~63;
+++      if (y_at_end) height = s->ps.sps->height - y_start;
+++      if ((((y+ctb_size)&63)==0) || y_at_end) {
+++        done_deblock = 1;
+++        rpi_deblock(s, y_start, height);
+++      }
+++    }
+ +#endif
+-+
+-     x_off += mv->x >> 2;
+-     y_off += mv->y >> 2;
+-     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1479,6 +1491,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+-     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++     if (s->ps.sps->sao_enabled) {
++         int y_end = y >= s->ps.sps->height - ctb_size;
++         if (y && x)
++@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++             sao_filter_CTB(s, x - ctb_size, y);
++         if (y && x_end) {
++             sao_filter_CTB(s, x, y - ctb_size);
++-            if (s->threads_type & FF_THREAD_FRAME )
+++            if (s->threads_type & FF_THREAD_FRAME ) {
+++#ifdef RPI_INTER_QPU
+++                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+ +#endif
+-+
+-     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+-         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+-         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+-@@ -1564,6 +1580,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-     intptr_t _mx         = mx << (1 - hshift);
+-     intptr_t _my         = my << (1 - vshift);
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++                 ff_thread_report_progress(&s->ref->tf, y, 0);
+++            }
++         }
++         if (x_end && y_end) {
++             sao_filter_CTB(s, x , y);
++-            if (s->threads_type & FF_THREAD_FRAME )
+++            if (s->threads_type & FF_THREAD_FRAME ) {
+++#ifdef RPI_INTER_QPU
+++                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+ +#endif
+-+
+-     x_off += mv->x >> (2 + hshift);
+-     y_off += mv->y >> (2 + vshift);
+-     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1628,6 +1648,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+-     int hshift = s->ps.sps->hshift[1];
+-     int vshift = s->ps.sps->vshift[1];
+- 
+-+#ifdef DISABLE_MC
+-+    return;
++                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+++            }
+++        }
+++    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+++        //int newh = y + ctb_size - 4;
+++        //int currh = s->ref->tf.progress->data[0];
+++        //if (((y + ctb_size)&63)==0)
+++#ifdef RPI_DEBLOCK_VPU
+++        if (s->enable_rpi_deblock) {
+++          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+++          if (done_deblock) {
+++            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+++          }
+++        } else {
+++#ifdef RPI_INTER_QPU
+++          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+ +#endif
+-+
+-     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+-     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+-     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+-@@ -2367,6 +2391,22 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+++          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
++         }
++-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+++#else
+++#ifdef RPI_INTER_QPU
+++        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+++#endif
++         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+++#endif
+++    }
+  }
+  
+- #ifdef RPI
+-+static void rpi_execute_transform(HEVCContext *s)
++ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
++diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
++index 83f2ec2..6882a8d 100644
++--- a/libavcodec/hevc_ps.c
+++++ b/libavcodec/hevc_ps.c
++@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++     sps->amp_enabled_flag = get_bits1(gb);
++     sps->sao_enabled      = get_bits1(gb);
++ 
+++    av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled);
+++
++     sps->pcm_enabled_flag = get_bits1(gb);
++     if (sps->pcm_enabled_flag) {
++         sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
++diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
++index 9d773d9..a6534a9 100644
++--- a/libavcodec/hevcdsp.c
+++++ b/libavcodec/hevcdsp.c
++@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
++ #include "hevcdsp_template.c"
++ #undef BIT_DEPTH
++ 
+++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs)
+ +{
+-+    int i=2;
+-+    //int j;
+-+    //int16_t *coeffs = s->coeffs_buf_arm[i];
+-+    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-+    //    s->hevcdsp.idct[4-2](coeffs, 16);
+-+    //}
+++    for (; pus > 0; pus--) {
+++        int strength, out;
+++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+ +
+-+    //gpu_cache_flush(&s->coeffs_buf[i]);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+++#if 1 // This more directly matches the original implementation
+++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+++            // same L0 and L1
+++            if (curr_refL0 == neigh_refL0 &&
+++                curr_refL0 == curr_refL1 &&
+++                neigh_refL0 == neigh_refL1) {
+++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL0 == curr_refL0 &&
+++                       neigh_refL1 == curr_refL1) {
+++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL1 == curr_refL0 &&
+++                       neigh_refL0 == curr_refL1) {
+++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else {
+++                strength = 1;
+++            }
+++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+++            Mv curr_mv0, neigh_mv0;
+ +
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[i] = 0;
+-+}
+++            if (curr->pred_flag & 1) {
+++                curr_mv0   = curr->mv[0];
+++            } else {
+++                curr_mv0   = curr->mv[1];
+++                curr_refL0 = curr_refL1;
+++            }
+ +
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+-@@ -2387,7 +2427,6 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+--  s->num_coeffs = 0;
+- }
+- #endif
+- 
+-@@ -2434,7 +2473,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (x_ctb + ctb_size >= s->ps.sps->width) {
+-+        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
+-+            rpi_execute_transform(s);
+-             rpi_execute_pred_cmds(s);
+-         }
+- #endif
+-@@ -3179,7 +3219,9 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_mv_cmds);
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+--    av_freep(&s->coeffs_buf);
+-+    for(i = 0; i < 4; i++) {
+-+        gpu_free(&s->coeffs_buf[i]);
+-+    }
+- #endif
+- 
+-     for (i = 0; i < 3; i++) {
+-@@ -3246,13 +3288,16 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+--    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
+--    if (!s->coeffs_buf)
+--        goto fail;
+-+    for(i = 0; i < 4; i++) {
+-+        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-+        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+-+        if (!s->coeffs_buf_arm[i])
+-+            goto fail;
+-+    }
+-     s->enable_rpi = 0;
+- 
+-     // A little test program
+--    {
+-+    /*{
+-       GPU_MEM_PTR_T p;
+-       int err = gpu_malloc_cached(16, &p);
+-       short *q = (short *)p.arm;
+-@@ -3273,7 +3318,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-       printf(")\n");
+-       gpu_free(&p);
+-       goto fail; // Early out
+--    }
+-+    }*/
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7a1c35f..4167985 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -40,6 +40,11 @@
+- #include "thread.h"
+- #include "videodsp.h"
+- 
+-+// define RPI to split the CABAC/prediction/transform into separate stages
+-+#ifdef RPI
+-+#include "rpi_qpu.h"
+++            if (neigh->pred_flag & 1) {
+++                neigh_mv0   = neigh->mv[0];
+++            } else {
+++                neigh_mv0   = neigh->mv[1];
+++                neigh_refL0 = neigh_refL1;
+++            }
+++
+++            if (curr_refL0 == neigh_refL0) {
+++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else
+++                strength = 1;
+++        } else
+++            strength = 1;
+++#else // This has exactly the same effect, but is more suitable for vectorisation
+++        Mv curr_mv[2];
+++        Mv neigh_mv[2];
+++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+++
+++        if (!(curr->pred_flag & 2)) {
+++            curr_mv[1] = curr_mv[0];
+++            curr_refL1 = curr_refL0;
+++        }
+++        if (!(neigh->pred_flag & 2)) {
+++            neigh_mv[1] = neigh_mv[0];
+++            neigh_refL1 = neigh_refL0;
+++        }
+++        if (!(curr->pred_flag & 1)) {
+++            curr_mv[0] = curr_mv[1];
+++            curr_refL0 = curr_refL1;
+++        }
+++        if (!(neigh->pred_flag & 1)) {
+++            neigh_mv[0] = neigh_mv[1];
+++            neigh_refL0 = neigh_refL1;
+++        }
+++
+++        strength = 1;
+++
+++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+++                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+++                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+++
+++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+++                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+++                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+++
+++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+ +#endif
+ +
+- #define MAX_DPB_SIZE 16 // A.4.1
+- #define MAX_REFS 16
+- 
+-@@ -856,11 +861,12 @@ typedef struct HEVCContext {
+-     HEVCMvCmd *unif_mv_cmds;
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+--    int16_t *coeffs_buf;
+--    int num_mv_cmds;
+-+    GPU_MEM_PTR_T coeffs_buf[4];
+-+    int16_t *coeffs_buf_arm[4];
+-+    int num_coeffs[4];
+-     int num_xfm_cmds;
+-+    int num_mv_cmds;
+-     int num_pred_cmds;
+--    int num_coeffs;
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 4e97f06..d1cba86 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1031,6 +1031,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
+-     int explicit_rdpcm_flag = 0;
+-@@ -1044,6 +1045,18 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     uint8_t dc_scale;
+-     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+-                                          lc->tu.intra_pred_mode_c;
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int n = trafo_size * trafo_size;
+-+        if (use_vpu) {
+-+            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
+-+            s->num_coeffs[log2_trafo_size - 2] += n;
+-+        } else {
+-+            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
+-+            s->num_coeffs[0] += n;
+++        curr += in_inc / sizeof (MvField);
+++        neigh += in_inc / sizeof (MvField);
+++
+++        for (out = dup; out > 0; out--)
+++        {
+++            *bs = strength;
+++            bs += out_inc;
+ +        }
+ +    }
+-+#endif
+- 
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- 
+-@@ -1488,6 +1501,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+-             s->hevcdsp.idct_4x4_luma(coeffs);
+-         } else {
+-+#ifdef RPI
+-+            if (!use_vpu) {
+-+              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+-+              if (max_xy == 0)
+-+                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-+              else {
+-+                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+-+                  if (max_xy < 4)
+-+                      col_limit = FFMIN(4, col_limit);
+-+                  else if (max_xy < 8)
+-+                      col_limit = FFMIN(8, col_limit);
+-+                  else if (max_xy < 12)
+-+                      col_limit = FFMIN(24, col_limit);
+++}
+ +
+-+                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+-+              }
+-+            }
+-+#else
+-             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+-             if (max_xy == 0)
+-                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-@@ -1501,6 +1532,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                     col_limit = FFMIN(24, col_limit);
+-                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+-             }
+-+#endif
+-         }
+-     }
+-     if (lc->tu.cross_pf) {
+-@@ -1512,14 +1544,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++ {
++ #undef FUNC
++@@ -257,6 +371,8 @@ int i = 0;
++         break;
+      }
+- #ifdef RPI
+-     if (s->enable_rpi) {
+--        int16_t *c = s->coeffs_buf + s->num_coeffs;
+--        int n = trafo_size * trafo_size;
+-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+--        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
+--        s->num_coeffs += n;
+-+        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+--        cmd->buf = c;
+-+        cmd->buf = coeffs;
+-         cmd->dst = dst;
+-         cmd->stride = stride;
+-         return;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f33b0c..e4c3da7 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -22,6 +22,10 @@
+-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+-  */
+  
+-+//#define DISABLE_SAO
+-+//#define DISABLE_DEBLOCK
+-+//#define DISABLE_STRENGTHS
+++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+ +
+- #include "libavutil/common.h"
+- #include "libavutil/internal.h"
+- 
+-@@ -273,6 +277,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
+-     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+-     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
++     if (ARCH_X86)
++         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
++     if (ARCH_ARM)
++diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
++index 9f1f6dd..e221e54 100644
++--- a/libavcodec/hevcdsp.h
+++++ b/libavcodec/hevcdsp.h
++@@ -42,6 +42,17 @@ typedef struct SAOParams {
++     uint8_t type_idx[3];    ///< sao_type_idx
++ } SAOParams;
+  
+-+#ifdef DISABLE_SAO
+-+    return;
+-+#endif
+++typedef struct Mv {
+++    int16_t x;  ///< horizontal component of motion vector
+++    int16_t y;  ///< vertical component of motion vector
+++} Mv;
+ +
+-     if (restore) {
+-         if (!edges[0]) {
+-             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+-@@ -496,6 +504,10 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+-+#ifdef DISABLE_DEBLOCK
+-+    return;
+-+#endif
+++typedef struct MvField {
+++    DECLARE_ALIGNED(4, Mv, mv)[2];
+++    int8_t ref_idx[2];
+++    int8_t pred_flag;
+++} MvField;
+ +
+-     if (x0) {
+-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+-         left_beta_offset = s->deblock[ctb - 1].beta_offset;
+-@@ -726,6 +738,10 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-     int boundary_upper, boundary_left;
+-     int i, j, bs;
++ typedef struct HEVCDSPContext {
++     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                     struct GetBitContext *gb, int pcm_bit_depth);
++@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
++     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                         int32_t *tc, uint8_t *no_p,
++                                         uint8_t *no_q);
+++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs);
++ } HEVCDSPContext;
+  
+-+#ifdef DISABLE_STRENGTHS
+-+    return;
+-+#endif
+-+
+-     boundary_upper = y0 > 0 && !(y0 & 7);
+-     if (boundary_upper &&
+-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
++ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+ diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 6ae87cc..71c6d52 100644
++index 6ae87cc..28d2653 100644
+ --- a/libavcodec/hevcpred_template.c
+ +++ b/libavcodec/hevcpred_template.c
+ @@ -20,6 +20,8 @@
+@@ -5149,7 +7113,20 @@ index 6ae87cc..71c6d52 100644
+  #include "libavutil/pixdesc.h"
+  
+  #include "bit_depth_template.c"
+-@@ -114,6 +116,10 @@ do {                                  \
++@@ -69,8 +71,11 @@ do {                                  \
++                 AV_WN4P(&ptr[i], a);                                           \
++             else                                                               \
++                 a = PIXEL_SPLAT_X4(ptr[i + 3])
++-
+++#ifdef RPI_WORKER
+++    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+++#else
++     HEVCLocalContext *lc = s->HEVClc;
+++#endif
++     int i;
++     int hshift = s->ps.sps->hshift[c_idx];
++     int vshift = s->ps.sps->vshift[c_idx];
++@@ -114,6 +119,10 @@ do {                                  \
+      int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                             (x0 + size_in_luma_h)) >> hshift;
+  
+@@ -5160,126 +7137,93 @@ index 6ae87cc..71c6d52 100644
+      if (s->ps.pps->constrained_intra_pred_flag == 1) {
+          int size_in_luma_pu_v = PU(size_in_luma_v);
+          int size_in_luma_pu_h = PU(size_in_luma_h);
++diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
++index 099a8c5..bdff2d2 100644
++--- a/libavcodec/mmaldec.c
+++++ b/libavcodec/mmaldec.c
++@@ -24,6 +24,9 @@
++  * MMAL Video Decoder
++  */
++ 
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
++ #include <bcm_host.h>
++ #include <interface/mmal/mmal.h>
++ #include <interface/mmal/mmal_parameters_video.h>
++@@ -31,6 +34,7 @@
++ #include <interface/mmal/util/mmal_util_params.h>
++ #include <interface/mmal/util/mmal_default_components.h>
++ #include <interface/mmal/vc/mmal_vc_api.h>
+++#pragma GCC diagnostic pop
++ 
++ #include "avcodec.h"
++ #include "internal.h"
++diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
++index 3adf28d..2f9195f 100644
++--- a/libavcodec/mpeg4videodec.c
+++++ b/libavcodec/mpeg4videodec.c
++@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
++ 
++         if (ctx->divx_version >= 0)
++             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
+++
+++        if (ctx->num_sprite_warping_points > 1)
+++            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
++     }
++ 
++     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
++@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
++                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
++                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
++ 
+++    avctx->workaround_bugs = s->workaround_bugs;
++     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
++         s->codec_id == AV_CODEC_ID_MPEG4 &&
++         avctx->idct_algo == FF_IDCT_AUTO) {
+ diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 85a9102..c0c279f 100644
+---- a/libavcodec/rpi_hevc_transform.h
++new file mode 100644
++index 0000000..4309f1c
++--- /dev/null
+ +++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,11 +3,11 @@ unsigned char rpi_hevc_transform [] = {
+- 3,
+- 3,
+- 232,
+--128,
+-+32,
+- 0,
+- 0,
+- 0,
+--20,
+-+12,
+- 248,
+- 0,
+- 136,
+-@@ -56,9 +56,9 @@ unsigned char rpi_hevc_transform [] = {
+- 5,
+- 232,
+- 0,
+--0,
+- 8,
+- 0,
+-+0,
+- 128,
+- 69,
+- 113,
+-@@ -108,8 +108,8 @@ unsigned char rpi_hevc_transform [] = {
+- 128,
+- 2,
+- 0,
+--248,
+--62,
+-+8,
+-+2,
+- 0,
+- 128,
+- 144,
+-@@ -123,13 +123,13 @@ unsigned char rpi_hevc_transform [] = {
+- 3,
+- 32,
+- 8,
+--16,
+-+20,
+- 0,
+- 76,
+- 254,
+- 48,
+- 192,
+--9,
+-+4,
+- 4,
+- 32,
+- 8,
+-@@ -155,14 +155,46 @@ unsigned char rpi_hevc_transform [] = {
+- 192,
+- 41,
+- 3,
+--68,
+-+70,
+-+192,
+-+80,
+-+7,
+-+164,
+-+255,
+-+36,
+-+204,
+-+96,
+-+2,
++@@ -0,0 +1,3070 @@
+++unsigned char rpi_hevc_transform [] = {
+++21,
+++106,
+ +0,
+-+248,
+-+62,
+++144,
+++47,
+++1,
+++37,
+++106,
+ +0,
+-+3,
+-+255,
+-+55,
+-+208,
+-+120,
+-+3,
+-+224,
+-+3,
+-+190,
+-+11,
+-+16,
+-+139,
+-+246,
+-+91,
+++144,
+++66,
+++1,
+++53,
+++106,
+ +0,
+-+103,
+-+90,
+++144,
+++192,
+++4,
+++69,
+++106,
+ +0,
+-+70,
+- 192,
+- 80,
+- 7,
+- 164,
+- 255,
+- 36,
+--220,
+-+204,
+- 96,
+- 2,
+- 0,
+-@@ -182,7 +214,7 @@ unsigned char rpi_hevc_transform [] = {
+- 16,
+- 139,
+- 246,
+--83,
+-+91,
+- 0,
+- 103,
+- 90,
+-@@ -209,4 +241,374 @@ unsigned char rpi_hevc_transform [] = {
+- 96,
+- 90,
+- 0,
+++144,
+++192,
+++4,
+++85,
+++106,
+++0,
+++144,
+++220,
+++5,
+ +169,
+ +3,
+++62,
+++64,
+++79,
+++64,
+ +3,
+ +232,
+ +32,
+@@ -5312,9 +7256,11 @@ index 85a9102..c0c279f 100644
+ +248,
+ +0,
+ +0,
+++0,
+++96,
+ +3,
+ +232,
+-+128,
+++32,
+ +0,
+ +0,
+ +0,
+@@ -5324,6 +7270,22 @@ index 85a9102..c0c279f 100644
+ +2,
+ +0,
+ +0,
+++8,
+++232,
+++0,
+++4,
+++0,
+++0,
+++12,
+++248,
+++0,
+++128,
+++0,
+++0,
+++192,
+++8,
+++4,
+++0,
+ +4,
+ +232,
+ +64,
+@@ -5336,6 +7298,184 @@ index 85a9102..c0c279f 100644
+ +8,
+ +0,
+ +0,
+++128,
+++69,
+++113,
+++66,
+++12,
+++248,
+++0,
+++128,
+++0,
+++0,
+++192,
+++8,
+++4,
+++0,
+++128,
+++69,
+++113,
+++70,
+++128,
+++144,
+++40,
+++0,
+++4,
+++255,
+++48,
+++192,
+++128,
+++3,
+++32,
+++8,
+++16,
+++0,
+++76,
+++254,
+++48,
+++192,
+++9,
+++4,
+++32,
+++8,
+++0,
+++0,
+++4,
+++254,
+++0,
+++144,
+++128,
+++2,
+++0,
+++8,
+++2,
+++0,
+++128,
+++144,
+++23,
+++0,
+++4,
+++255,
+++48,
+++192,
+++128,
+++3,
+++32,
+++8,
+++20,
+++0,
+++76,
+++254,
+++48,
+++192,
+++4,
+++4,
+++32,
+++8,
+++0,
+++0,
+++140,
+++248,
+++44,
+++0,
+++0,
+++0,
+++32,
+++48,
+++4,
+++0,
+++128,
+++69,
+++113,
+++66,
+++242,
+++140,
+++211,
+++192,
+++34,
+++31,
+++41,
+++3,
+++70,
+++192,
+++80,
+++7,
+++164,
+++255,
+++36,
+++204,
+++96,
+++2,
+++0,
+++248,
+++62,
+++0,
+++3,
+++255,
+++55,
+++208,
+++120,
+++3,
+++224,
+++3,
+++190,
+++11,
+++16,
+++139,
+++246,
+++91,
+++0,
+++103,
+++90,
+++0,
+++70,
+++192,
+++80,
+++7,
+++164,
+++255,
+++36,
+++204,
+++224,
+++2,
+++0,
+++248,
+++62,
+++0,
+++3,
+++255,
+++55,
+++208,
+++120,
+++3,
+++224,
+++3,
+++190,
+++11,
+++16,
+++139,
+++246,
+++91,
+++0,
+++103,
+++90,
+++0,
+++225,
+++64,
+++242,
+++64,
+++3,
+++232,
+++128,
+++0,
+++0,
+++0,
+++7,
+++232,
+++0,
+++2,
+++0,
+++0,
+ +57,
+ +239,
+ +224,
+@@ -5354,18 +7494,26 @@ index 85a9102..c0c279f 100644
+ +64,
+ +26,
+ +64,
+++4,
+++232,
+++64,
+++0,
+++0,
+++0,
+++149,
+++96,
+ +161,
+ +64,
+ +152,
+ +64,
+ +128,
+ +144,
+-+31,
+++35,
+ +0,
+ +72,
+ +232,
+-+32,
+ +0,
+++4,
+ +0,
+ +0,
+ +65,
+@@ -5376,8 +7524,16 @@ index 85a9102..c0c279f 100644
+ +0,
+ +128,
+ +144,
+-+23,
+++27,
+++0,
+++4,
+++232,
+++0,
+++8,
+++0,
+ +0,
+++69,
+++96,
+ +145,
+ +64,
+ +168,
+@@ -5388,8 +7544,8 @@ index 85a9102..c0c279f 100644
+ +0,
+ +72,
+ +232,
+-+32,
+ +0,
+++4,
+ +0,
+ +0,
+ +65,
+@@ -5410,7 +7566,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +242,
+ +140,
+-+229,
+++221,
+ +192,
+ +57,
+ +239,
+@@ -5420,6 +7576,8 @@ index 85a9102..c0c279f 100644
+ +0,
+ +41,
+ +3,
+++239,
+++3,
+ +12,
+ +248,
+ +0,
+@@ -5427,7 +7585,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +0,
+ +192,
+-+8,
+++248,
+ +4,
+ +0,
+ +12,
+@@ -5437,14 +7595,14 @@ index 85a9102..c0c279f 100644
+ +64,
+ +0,
+ +192,
+-+8,
+++248,
+ +4,
+ +0,
+ +0,
+ +96,
+ +255,
+ +159,
+-+131,
+++154,
+ +255,
+ +0,
+ +232,
+@@ -5454,7 +7612,7 @@ index 85a9102..c0c279f 100644
+ +0,
+ +255,
+ +159,
+-+142,
+++165,
+ +255,
+ +4,
+ +255,
+@@ -5466,7 +7624,7 @@ index 85a9102..c0c279f 100644
+ +251,
+ +62,
+ +0,
+-+5,
+++4,
+ +255,
+ +51,
+ +204,
+@@ -5476,15 +7634,15 @@ index 85a9102..c0c279f 100644
+ +251,
+ +16,
+ +0,
+-+77,
+++76,
+ +254,
+ +51,
+ +204,
+-+9,
+-+4,
+++128,
+++3,
+ +224,
+ +251,
+-+0,
+++20,
+ +0,
+ +128,
+ +64,
+@@ -5504,16 +7662,6 @@ index 85a9102..c0c279f 100644
+ +99,
+ +0,
+ +0,
+-+4,
+-+254,
+-+0,
+-+144,
+-+128,
+-+2,
+-+0,
+-+8,
+-+2,
+-+0,
+ +32,
+ +247,
+ +240,
+@@ -5525,92 +7673,92 @@ index 85a9102..c0c279f 100644
+ +176,
+ +207,
+ +17,
+-+3,
+++19,
+ +32,
+ +247,
+ +112,
+ +207,
+ +18,
+-+3,
+++35,
+ +32,
+ +247,
+ +48,
+ +207,
+ +19,
+-+3,
+++51,
+ +32,
+ +247,
+ +240,
+ +206,
+ +20,
+-+3,
+++67,
+ +32,
+ +247,
+ +176,
+ +206,
+ +21,
+-+3,
+++83,
+ +32,
+ +247,
+ +112,
+ +206,
+ +22,
+-+3,
+++99,
+ +32,
+ +247,
+ +48,
+ +206,
+ +23,
+-+3,
+++115,
+ +32,
+ +247,
+ +240,
+ +205,
+ +24,
+-+3,
+++131,
+ +32,
+ +247,
+ +176,
+ +205,
+ +25,
+-+3,
+++147,
+ +32,
+ +247,
+ +112,
+ +205,
+ +26,
+-+3,
+++163,
+ +32,
+ +247,
+ +48,
+ +205,
+ +27,
+-+3,
+++179,
+ +32,
+ +247,
+ +240,
+ +204,
+ +28,
+-+3,
+++195,
+ +32,
+ +247,
+ +176,
+ +204,
+ +29,
+-+3,
+++211,
+ +32,
+ +247,
+ +112,
+ +204,
+ +30,
+-+3,
+++227,
+ +32,
+ +247,
+ +48,
+ +204,
+ +31,
+-+3,
+-+5,
+++243,
+++4,
+ +255,
+ +51,
+ +204,
+@@ -5620,20 +7768,20 @@ index 85a9102..c0c279f 100644
+ +251,
+ +16,
+ +0,
+-+77,
+++76,
+ +254,
+ +51,
+ +204,
+-+9,
+-+4,
+++128,
+++3,
+ +224,
+ +251,
+-+0,
+++20,
+ +0,
+ +0,
+ +237,
+++32,
+ +0,
+-+4,
+ +0,
+ +0,
+ +140,
+@@ -5646,29347 +7794,1846 @@ index 85a9102..c0c279f 100644
+ +99,
+ +0,
+ +0,
+++111,
+++3,
+++4,
+++254,
+++0,
+++128,
+++0,
+++4,
+++0,
+++248,
+++0,
+++0,
+++2,
+++232,
+++32,
+++0,
+++0,
+++0,
+++140,
+++248,
+++32,
+++0,
+++0,
+++0,
+++224,
+++35,
+++0,
+++0,
+++64,
+++232,
+++0,
+++2,
+++0,
+++0,
+++193,
+++232,
+++0,
+++1,
+++0,
+++0,
+++1,
+++106,
+++116,
+++30,
+ +90,
+ +0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index 5e2728d..1e389c7 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -58,13 +58,6 @@
+- #
+- #
+- 
+--test_add:
+--  vldh HX(0,0),(r0)
+--  vadd HX(0,0),HX(0,0),10
+--  vsth HX(0,0),(r0)
+--  mov r0,7 # return value
+--  b lr
+--
+- # Columns are transformed first
+- #
+- # Store top left half of transMatrix2 in
+-@@ -79,7 +72,7 @@ test_add:
+- #
+- 
+- 
+--# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
+-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+- # num: number of 16x16 transforms to be done
+-@@ -87,17 +80,17 @@ test_add:
+- hevc_trans_16x16:
+-   push r6-r15, lr # TODO cut down number of used registers
+- 
+--  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
+--  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+  mov r3, 16*2 # Stride of transMatrix2 in bytes
+-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-   # Now use r0 to describe which matrix we are working on.
+-   # Allows us to prefetch the next block of coefficients for efficiency.
+-   mov r0,0 # This describes the location where we read our coefficients from
+--  mov r3,16*2 # Stride of coefficients in bytes
+-+  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+-   mov r7,16*16*2 # Total block size
+-   mov r8,64*16 # Value used to swap from current to next VRF location
+-   vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-   mov r4,64 # Constant used for rounding first pass
+--  mov r5,1<<19 # Constant used for rounding second pass
+-+  mov r5,1<<11 # Constant used for rounding second pass
+- 
+-   # At start of block r0,r1 point to the current block (that has already been loaded)
+- block_loop:
+-@@ -113,12 +106,12 @@ block_loop:
+-   vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+-   #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+-   vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+--  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
+-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
+- 
+-   bl col_trans_16
+--  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+--  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+--  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+-+  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
+-+  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+-+  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
+- 
+-   # Save results - note there has been a transposition during the processing so we save columns
+-   vsth VX(0,32++)+r0, (r1 += r3) REP 16
+-@@ -132,16 +125,136 @@ block_loop:
+- 
+- # r1,r2,r3 r7,r8 should be preserved
+- # HX(0++,0)+r0 is the block to be transformed
+--# HX(32++,0) is the 16x16 matrix of transform coefficients
+-+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
+- # Use HY(48,0) for intermediate results
+- # r0 can be used, but should be returned to its original value at the end
+- col_trans_16:
+--  add r4,r0,16 # Final value for this loop
+-+  add r6,r0,16 # Final value for this loop
+- col_trans_16_loop:
+-   # First compute partial products for a single column
+--  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+--  addcmpblt r0,1,r4,col_trans_16_loop
+-+  addcmpblt r0,1,r6,col_trans_16_loop
+-   sub r0,16  # but r0 back to its original value
+-   b lr
+-+
+-+col_trans_odd_16:
+-+  add r6,r0,16 # Final value for this loop
+-+col_trans_odd_16_loop:
+-+  # First compute partial products for a single column
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-+  # Then sum up the results and place back
+-+  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-+  addcmpblt r0,1,r6,col_trans_odd_16_loop
+-+  sub r0,16  # but r0 back to its original value
+-+  b lr
+-+
+-+
+-+test_add:
+-+  vldh HX(0,0),(r0)
+-+  vadd HX(0,0),HX(0,0),10
+-+  vsth HX(0,0),(r0)
+-+  mov r0,7 # return value
+-+  b lr
+-+
+-+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+-+# num: number of 16x16 transforms to be done
+-+#
+-+hevc_trans_32x32:
+-+  push r6-r15, lr # TODO cut down number of used registers
+-+
+-+  # Fetch transform matrices
+-+  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+-+  add r0, 16*16*2
+-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+
+-+  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+-+  mov r7, 16*16*2 # Total block size
+-+  mov r4, 64 # Constant used for rounding first pass
+-+  mov r5, 1<<11 # Constant used for rounding second pass
+-+  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+-+  # set r8 to 32byte aligned stack pointer
+-+  add r8,sp,31
+-+  lsr r8,5
+-+  lsl r8,5
+-+  mov r9,r8  # Backup of the temporary storage
+-+  mov r10,r1 # Backup of the coefficient buffer
+-+block_loop32:
+-+
+-+  # COLUMN TRANSFORM
+-+  # Transform the first 16 columns
+-+  mov r1,r10  # Input Coefficient buffer
+-+  mov r8,r9   # Output temporary storage
+-+  bl trans32
+-+  # Transform the second 16 columns
+-+  add r8,32
+-+  add r1,32
+-+  bl trans32
+-+
+-+  # ROW TRANSFORM
+-+  mov r1,r9  # Input temporary storage
+-+  mov r8,r10   # Output Coefficient buffer
+-+  bl trans32
+-+  # Transform the second 16 columns
+-+  add r8,32
+-+  add r1,32
+-+  bl trans32
+-+
+-+  add r10, 32*32*2 # move onto next block of coefficients
+-+  addcmpbgt r2,-1,0,block_loop32
+-+
+-+  add sp,sp,32*32*2+32 # Restore stack
+-+
+-+  pop r6-r15, pc
+-+
+-+trans32:
+-+  # We can no longer afford the VRF space to do prefetching when doing 32x32
+-+  # Fetch the even rows
+-+  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  # Fetch the odd rows
+-+  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+-+
+-+  # Transform the even rows using even matrix
+-+  mov r0, 0 # Even rows
+-+  bl col_trans_16
+-+
+-+  # Now transform the odd rows using odd matrix
+-+  mov r0, 64*16 # Odd rows
+-+  bl col_trans_odd_16
+-+
+-+  # Now apply butterfly to compute the first 16 results
+-+  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  # 16bit results now in HX(48,32)
+-+  mov r0,r8
+-+  mov r6,32*2
+-+  vsth VX(48,32++),(r0+=r6) REP 16
+-+  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
+-+
+-+  # Now apply butterfly to compute the second 16 results (in reverse order)
+-+  vsub HY(63,0),HY(0,0),HY(16,0)
+-+  vsub HY(62,0),HY(0,0),HY(17,0)
+-+  vsub HY(61,0),HY(0,0),HY(18,0)
+-+  vsub HY(60,0),HY(0,0),HY(19,0)
+-+  vsub HY(59,0),HY(0,0),HY(20,0)
+-+  vsub HY(58,0),HY(0,0),HY(21,0)
+-+  vsub HY(57,0),HY(0,0),HY(22,0)
+-+  vsub HY(56,0),HY(0,0),HY(23,0)
+-+  vsub HY(55,0),HY(0,0),HY(24,0)
+-+  vsub HY(54,0),HY(0,0),HY(25,0)
+-+  vsub HY(53,0),HY(0,0),HY(26,0)
+-+  vsub HY(52,0),HY(0,0),HY(27,0)
+-+  vsub HY(51,0),HY(0,0),HY(28,0)
+-+  vsub HY(50,0),HY(0,0),HY(29,0)
+-+  vsub HY(49,0),HY(0,0),HY(30,0)
+-+  vsub HY(48,0),HY(0,0),HY(31,0)
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  add r0,r8,16*32*2 # Move to 16th row
+-+  vsth VX(48,32++),(r0+=r6) REP 16
+-+  b lr
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index b1f50ee..d720546 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -3,6 +3,7 @@
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+- #define RPI_TIME_TOTAL_QPU
+-+#define RPI_TIME_TOTAL_VPU
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -48,10 +49,47 @@ typedef int int32_t;
+- #define QPU_CODE_SIZE 2048
+- #define VPU_CODE_SIZE 2048
+- 
+-+const short rpi_transMatrix2even[32][16] = { // Even rows first
+-+{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
+-+{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
+-+{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
+-+{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
+-+{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
+-+{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
+-+{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
+-+{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
+-+{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
+-+{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
+-+{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
+-+{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
+-+{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
+-+{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
+-+{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
+-+{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
+-+// Odd rows
+-+{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
+-+{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+-+{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
+-+{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
+-+{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
+-+{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
+-+{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
+-+{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
+-+{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
+-+{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
+-+{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
+-+{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
+-+{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
+-+{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
+-+{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
+-+{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+-+};
+-+
+- struct GPU
+- {
+-   unsigned int qpu_code[QPU_CODE_SIZE];
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+-+  short transMatrix2even[16*16];
+-   int open_count; // Number of allocated video buffers
+-   unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-@@ -123,6 +161,8 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-+  // And the transform coefficients
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
+- 
+-   return 0;
+- }
+-@@ -274,11 +314,43 @@ unsigned int vpu_get_fn(void) {
+-   return gpu->vc + offsetof(struct GPU,vpu_code);
+- }
+- 
+-+unsigned int vpu_get_constants(void) {
+-+  if (gpu==NULL) {
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
+-+}
+-+
+- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+- {
+-   unsigned r;
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
+-+  static long long countr2=0;
+-+#endif
+-   gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
+-+#endif
+-   r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
+-+#ifdef RPI_TIME_TOTAL_VPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  countr2 += r2;
+-+  if ((count&0x7f)==0)
+-+    printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-   gpu_unlock();
+-   return r;
+- }
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 4e3c35c..814fc3c 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -34,6 +34,7 @@ extern unsigned int qpu_get_fn(int num);
+- 
+- // VPU specific functions
+- extern unsigned int vpu_get_fn(void);
+-+extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- 
+- // Simple test of shader code
+--- 
+-2.7.4
+-
+-
+-From 4bb0a7ba6723650e74d63cec2123f76da4c3eb0e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 09:41:23 +0100
+-Subject: [PATCH 05/68] Fixed deblocking
+-
+----
+- libavcodec/hevc.c | 20 +++++++++++++++++---
+- 1 file changed, 17 insertions(+), 3 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 94ff709..391c57a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2400,8 +2400,9 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    //gpu_cache_flush(&s->coeffs_buf[i]);
+-+    gpu_cache_flush(&s->coeffs_buf[i]);
+-     vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+-+    gpu_cache_flush(&s->coeffs_buf[i]);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2440,6 +2441,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+-+    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+- #endif
+- 
+-@@ -2473,9 +2475,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
+-+        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-+            int x;
+-+            // Transform all blocks
+-             rpi_execute_transform(s);
+-+            // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-+            // Perform deblocking for CTBs in this row
+-+            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
+-+                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
+-+            }
+-+            start_ctb_x = 0;
+-         }
+- #endif
+-         if (more_data < 0) {
+-@@ -2486,6 +2496,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         ctb_addr_ts++;
+-         ff_hevc_save_states(s, ctb_addr_ts);
+-+#ifdef RPI
+-+        if (s->enable_rpi)
+-+            continue;
+-+#endif
+-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+-     }
+- 
+-@@ -3289,7 +3303,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+-     for(i = 0; i < 4; i++) {
+--        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-+        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+-         s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+-         if (!s->coeffs_buf_arm[i])
+-             goto fail;
+--- 
+-2.7.4
+-
+-
+-From 9079ef888e3d81a69f3c802ddc3c5134679e74a6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 11:32:30 +0100
+-Subject: [PATCH 06/68] Added 32x32 transform
+-
+----
+- libavcodec/hevc.c               |   8 +-
+- libavcodec/hevc_cabac.c         |   4 +-
+- libavcodec/rpi_hevc_transform.h | 200 +++++++++++++++++-----------------------
+- libavcodec/rpi_hevc_transform.s | 102 ++++++++++----------
+- libavcodec/rpi_qpu.c            |   4 +-
+- 5 files changed, 148 insertions(+), 170 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 391c57a..0dde6f2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2400,9 +2400,11 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf[i]);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
+--    gpu_cache_flush(&s->coeffs_buf[i]);
+-+    gpu_cache_flush(&s->coeffs_buf[2]);
+-+    gpu_cache_flush(&s->coeffs_buf[3]);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
+-+    gpu_cache_flush(&s->coeffs_buf[2]);
+-+    gpu_cache_flush(&s->coeffs_buf[3]);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index d1cba86..88aa959 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1031,7 +1031,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+-                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
+--    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
+-+#ifdef RPI
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
+-+#endif
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+-     uint8_t significant_coeff_group_flag[8][8] = {{0}};
+-     int explicit_rdpcm_flag = 0;
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index c0c279f..6d772d7 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -1,6 +1,10 @@
+- unsigned char rpi_hevc_transform [] = {
+- 169,
+- 3,
+-+62,
+++169,
+++3,
+++73,
+ +64,
+-+79,
+++52,
+++64,
+++45,
+++64,
+++2,
+++64,
+++10,
+ +64,
+- 3,
+- 232,
+- 32,
+-@@ -17,6 +21,22 @@ unsigned char rpi_hevc_transform [] = {
+- 248,
+- 0,
+- 0,
+ +64,
+++198,
+++1,
+++7,
+++8,
+ +232,
+++63,
+ +0,
+-+2,
+ +0,
+ +0,
+-+12,
+-+248,
+++6,
+++232,
+++253,
+++255,
+++255,
+++255,
+ +0,
+-+168,
+++246,
+ +0,
+ +0,
+-+192,
+++0,
+++4,
+++215,
+++64,
+++3,
+++96,
+++2,
+ +248,
+ +0,
+++35,
+++0,
+ +0,
+- 0,
+- 96,
+- 3,
+-@@ -79,7 +99,7 @@ unsigned char rpi_hevc_transform [] = {
+- 70,
+- 128,
+- 144,
+--39,
+-+40,
+- 0,
+- 4,
+- 255,
+-@@ -113,7 +133,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 128,
+- 144,
+--22,
+-+23,
+- 0,
+- 4,
+- 255,
+-@@ -153,6 +173,8 @@ unsigned char rpi_hevc_transform [] = {
+- 140,
+- 211,
+- 192,
+-+34,
+-+31,
+- 41,
+- 3,
+- 70,
+-@@ -195,7 +217,7 @@ unsigned char rpi_hevc_transform [] = {
+- 255,
+- 36,
+- 204,
+--96,
+-+224,
+- 2,
+- 0,
+- 248,
+-@@ -219,62 +241,10 @@ unsigned char rpi_hevc_transform [] = {
+- 103,
+- 90,
+- 0,
+--8,
+--240,
+--0,
+--128,
+--128,
+--3,
+--0,
+--247,
+--32,
+--128,
+--10,
+--4,
+--136,
+--240,
+--32,
+--0,
+--128,
+--3,
+--112,
+--96,
+--90,
+--0,
+--169,
+--3,
+--3,
+--232,
+--32,
+--0,
+--0,
+--0,
+--12,
+--248,
+--0,
+--136,
+--0,
+--0,
+--192,
+--248,
+--0,
+--0,
+-+225,
+ +64,
+-+242,
+- 64,
+--232,
+--0,
+--2,
+--0,
+--0,
+--12,
+--248,
+--0,
+--168,
+--0,
+--0,
+--192,
+--248,
+--0,
+--0,
+- 3,
+- 232,
+- 128,
+-@@ -287,18 +257,6 @@ unsigned char rpi_hevc_transform [] = {
+- 2,
+- 0,
+- 0,
+--4,
+--232,
+--64,
+--0,
+--0,
+--0,
+--5,
+--232,
+--0,
+--8,
+--0,
+--0,
+- 57,
+- 239,
+- 224,
+-@@ -317,18 +275,26 @@ unsigned char rpi_hevc_transform [] = {
+- 64,
+- 26,
+- 64,
+++56,
+++0,
+++0,
+ +4,
+-+232,
+++248,
+++0,
+++36,
+++0,
+++0,
+++64,
+++56,
+++8,
+++0,
+++0,
+++240,
+ +64,
+ +0,
+++132,
+++3,
+++128,
+++240,
+ +0,
+ +0,
+-+149,
+-+96,
+- 161,
+- 64,
+- 152,
+- 64,
+- 128,
+- 144,
+--31,
+++132,
+++3,
+++128,
+++144,
+++137,
+++0,
+++131,
+++98,
+++0,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++129,
+++0,
+++131,
+++102,
+++0,
+++158,
+++67,
+++0,
+++2,
+++248,
+++0,
+ +35,
+- 0,
+- 72,
+- 232,
+--32,
+- 0,
+-+4,
+- 0,
+- 0,
+- 65,
+-@@ -339,8 +305,16 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 128,
+- 144,
+--23,
+-+27,
+++0,
+++0,
+++64,
+++56,
+++0,
+ +0,
+ +4,
+-+232,
+++248,
+++0,
+++36,
+++0,
+ +0,
+++64,
+++56,
+ +8,
+- 0,
+ +0,
+-+69,
+-+96,
+- 145,
+- 64,
+- 168,
+-@@ -351,8 +325,8 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 72,
+- 232,
+--32,
+- 0,
+-+4,
+- 0,
+- 0,
+- 65,
+-@@ -373,7 +347,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 242,
+- 140,
+--229,
+-+221,
+- 192,
+- 57,
+- 239,
+-@@ -383,6 +357,8 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 41,
+- 3,
+-+239,
+++0,
+++240,
+++64,
+++0,
+++132,
+ +3,
+- 12,
+- 248,
+- 0,
+-@@ -390,7 +366,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 0,
+- 192,
+--8,
+-+248,
+- 4,
+- 0,
+- 12,
+-@@ -400,14 +376,14 @@ unsigned char rpi_hevc_transform [] = {
+- 64,
+- 0,
+- 192,
+--8,
+-+248,
+- 4,
+- 0,
+- 0,
+- 96,
+- 255,
+- 159,
+--131,
+-+154,
+- 255,
+- 0,
+- 232,
+-@@ -417,7 +393,7 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 255,
+- 159,
+--142,
+-+165,
+- 255,
+- 4,
+- 255,
+-@@ -429,7 +405,7 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 62,
+- 0,
+--5,
+-+4,
+- 255,
+- 51,
+- 204,
+-@@ -439,15 +415,15 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 16,
+- 0,
+--77,
+-+76,
+- 254,
+- 51,
+- 204,
+--9,
+--4,
+ +128,
+++240,
+++0,
+++0,
+++132,
+ +3,
+- 224,
+- 251,
+--0,
+++128,
+++144,
+++108,
+++0,
+++131,
+++98,
+++0,
+++255,
+++64,
+++0,
+++0,
+ +20,
+- 0,
+- 128,
+- 64,
+-@@ -467,16 +443,6 @@ unsigned char rpi_hevc_transform [] = {
+- 99,
+- 0,
+- 0,
+--4,
+--254,
+--0,
+--144,
+--128,
+--2,
+--0,
+--8,
+--2,
+--0,
+- 32,
+- 247,
+- 240,
+-@@ -488,92 +454,92 @@ unsigned char rpi_hevc_transform [] = {
+- 176,
+- 207,
+- 17,
+--3,
+-+19,
+- 32,
+- 247,
+- 112,
+- 207,
+- 18,
+--3,
+-+35,
+- 32,
+- 247,
+- 48,
+- 207,
+- 19,
+--3,
+-+51,
+- 32,
+- 247,
+- 240,
+- 206,
+- 20,
+--3,
+-+67,
+- 32,
+- 247,
+- 176,
+- 206,
+- 21,
+--3,
+-+83,
+- 32,
+- 247,
+- 112,
+- 206,
+- 22,
+--3,
+-+99,
+- 32,
+- 247,
+- 48,
+- 206,
+- 23,
+--3,
+-+115,
+- 32,
+- 247,
+- 240,
+- 205,
+- 24,
+--3,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++100,
+++0,
+ +131,
+- 32,
+- 247,
+- 176,
+- 205,
+- 25,
+--3,
+-+147,
+- 32,
+- 247,
+- 112,
+- 205,
+- 26,
+--3,
+-+163,
+- 32,
+- 247,
+- 48,
+- 205,
+- 27,
+--3,
+-+179,
+- 32,
+- 247,
+- 240,
+- 204,
+- 28,
+--3,
+-+195,
+- 32,
+- 247,
+- 176,
+- 204,
+- 29,
+--3,
+++102,
+++0,
+++248,
+++64,
+++0,
+++112,
+++0,
+++192,
+++243,
+ +211,
+- 32,
+- 247,
+- 112,
+- 204,
+- 30,
+--3,
+-+227,
+- 32,
+- 247,
+- 48,
+- 204,
+- 31,
+--3,
+--5,
+++31,
+++128,
+++248,
+++0,
+++0,
+++112,
+++0,
+++192,
+ +243,
+-+4,
+- 255,
+- 51,
+- 204,
+-@@ -583,20 +549,20 @@ unsigned char rpi_hevc_transform [] = {
+- 251,
+- 16,
+- 0,
+--77,
+-+76,
+- 254,
+- 51,
+- 204,
+--9,
+--4,
+++211,
+++31,
+ +128,
+-+3,
+- 224,
+- 251,
+--0,
+-+20,
+- 0,
+- 0,
+- 237,
+-+32,
+- 0,
+--4,
+- 0,
+- 0,
+- 140,
+-@@ -609,6 +575,6 @@ unsigned char rpi_hevc_transform [] = {
+- 99,
+- 0,
+- 0,
+--90,
+--0,
+-+111,
+-+3,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index 1e389c7..afdb32a 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -76,12 +76,19 @@
+- # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+- # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+- # num: number of 16x16 transforms to be done
+-+# coeffs32
+-+# num32: number of 32x32 transforms
+- #
+- hevc_trans_16x16:
+-   push r6-r15, lr # TODO cut down number of used registers
+--
+-+  mov r14,r3 # coeffs32
+-+  mov r15,r4 # num32
+-   mov r3, 16*2 # Stride of transMatrix2 in bytes
+-   vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+-+
+-+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+
+-   # Now use r0 to describe which matrix we are working on.
+-   # Allows us to prefetch the next block of coefficients for efficiency.
+-   mov r0,0 # This describes the location where we read our coefficients from
+-@@ -121,6 +128,10 @@ block_loop:
+-   add r1,r7
+- 
+-   addcmpbgt r2,-1,0,block_loop
+-+
+-+  # Now go and do any 32x32 transforms
+-+  b hevc_trans_32x32
+-+
+-   pop r6-r15, pc
+- 
+- # r1,r2,r3 r7,r8 should be preserved
+-@@ -136,26 +147,18 @@ col_trans_16_loop:
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-   addcmpblt r0,1,r6,col_trans_16_loop
+--  sub r0,16  # but r0 back to its original value
+-+  sub r0,16  # put r0 back to its original value
+-   b lr
+- 
+- col_trans_odd_16:
+-   add r6,r0,16 # Final value for this loop
+- col_trans_odd_16_loop:
+-   # First compute partial products for a single column
+--  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+-+  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+-   # Then sum up the results and place back
+-   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+-   addcmpblt r0,1,r6,col_trans_odd_16_loop
+--  sub r0,16  # but r0 back to its original value
+--  b lr
+--
+--
+--test_add:
+--  vldh HX(0,0),(r0)
+--  vadd HX(0,0),HX(0,0),10
+--  vsth HX(0,0),(r0)
+--  mov r0,7 # return value
+-+  sub r0,16  # put r0 back to its original value
+-   b lr
+- 
+- # hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+-@@ -164,18 +167,17 @@ test_add:
+- # num: number of 16x16 transforms to be done
+- #
+- hevc_trans_32x32:
+--  push r6-r15, lr # TODO cut down number of used registers
+-+  mov r1,r14 # coeffs
+-+  mov r2,r15 # num
+- 
+--  # Fetch transform matrices
+--  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+--  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+--  add r0, 16*16*2
+--  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+-+  # Fetch odd transform matrix
+-+  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+-+  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+-+  #add r0, 16*16*2
+-+  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+- 
+-   mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+-   mov r7, 16*16*2 # Total block size
+--  mov r4, 64 # Constant used for rounding first pass
+--  mov r5, 1<<11 # Constant used for rounding second pass
+-   sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+-   # set r8 to 32byte aligned stack pointer
+-   add r8,sp,31
+-@@ -186,21 +188,27 @@ hevc_trans_32x32:
+- block_loop32:
+- 
+-   # COLUMN TRANSFORM
+-+  mov r4, 64 # Constant used for rounding first pass
+-+  mov r5, 9 # left shift used for rounding first pass
+-+
+-   # Transform the first 16 columns
+-   mov r1,r10  # Input Coefficient buffer
+-   mov r8,r9   # Output temporary storage
+-   bl trans32
+-   # Transform the second 16 columns
+--  add r8,32
+-+  add r8,32*16*2
+-   add r1,32
+-   bl trans32
+- 
+-   # ROW TRANSFORM
+-+  mov r4, 1<<11 # Constant used for rounding second pass
+-+  mov r5, 4 # left shift used for rounding second pass
+-+
+-   mov r1,r9  # Input temporary storage
+-   mov r8,r10   # Output Coefficient buffer
+-   bl trans32
+-   # Transform the second 16 columns
+--  add r8,32
+-+  add r8,32*16*2
+-   add r1,32
+-   bl trans32
+- 
+-@@ -212,11 +220,12 @@ block_loop32:
+-   pop r6-r15, pc
+- 
+- trans32:
+-+  push lr
+-   # We can no longer afford the VRF space to do prefetching when doing 32x32
+-   # Fetch the even rows
+--  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+-+  vldh HX(0++,0),(r1 += r3) REP 16
+-   # Fetch the odd rows
+--  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+-+  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+- 
+-   # Transform the even rows using even matrix
+-   mov r0, 0 # Even rows
+-@@ -228,33 +237,32 @@ trans32:
+- 
+-   # Now apply butterfly to compute the first 16 results
+-   vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
+-   # 16bit results now in HX(48,32)
+-   mov r0,r8
+-   mov r6,32*2
+-   vsth VX(48,32++),(r0+=r6) REP 16
+--  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
+- 
+-   # Now apply butterfly to compute the second 16 results (in reverse order)
+--  vsub HY(63,0),HY(0,0),HY(16,0)
+--  vsub HY(62,0),HY(0,0),HY(17,0)
+--  vsub HY(61,0),HY(0,0),HY(18,0)
+--  vsub HY(60,0),HY(0,0),HY(19,0)
+--  vsub HY(59,0),HY(0,0),HY(20,0)
+--  vsub HY(58,0),HY(0,0),HY(21,0)
+--  vsub HY(57,0),HY(0,0),HY(22,0)
+--  vsub HY(56,0),HY(0,0),HY(23,0)
+--  vsub HY(55,0),HY(0,0),HY(24,0)
+--  vsub HY(54,0),HY(0,0),HY(25,0)
+--  vsub HY(53,0),HY(0,0),HY(26,0)
+--  vsub HY(52,0),HY(0,0),HY(27,0)
+--  vsub HY(51,0),HY(0,0),HY(28,0)
+--  vsub HY(50,0),HY(0,0),HY(29,0)
+--  vsub HY(49,0),HY(0,0),HY(30,0)
+--  vsub HY(48,0),HY(0,0),HY(31,0)
+--  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
+--  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
+--  add r0,r8,16*32*2 # Move to 16th row
+-+  vsub HY(63,0),HY(0 ,0),HY(16,0)
+-+  vsub HY(62,0),HY(1 ,0),HY(17,0)
+-+  vsub HY(61,0),HY(2 ,0),HY(18,0)
+-+  vsub HY(60,0),HY(3 ,0),HY(19,0)
+-+  vsub HY(59,0),HY(4 ,0),HY(20,0)
+-+  vsub HY(58,0),HY(5 ,0),HY(21,0)
+-+  vsub HY(57,0),HY(6 ,0),HY(22,0)
+-+  vsub HY(56,0),HY(7 ,0),HY(23,0)
+-+  vsub HY(55,0),HY(8 ,0),HY(24,0)
+-+  vsub HY(54,0),HY(9 ,0),HY(25,0)
+-+  vsub HY(53,0),HY(10,0),HY(26,0)
+-+  vsub HY(52,0),HY(11,0),HY(27,0)
+-+  vsub HY(51,0),HY(12,0),HY(28,0)
+-+  vsub HY(50,0),HY(13,0),HY(29,0)
+-+  vsub HY(49,0),HY(14,0),HY(30,0)
+-+  vsub HY(48,0),HY(15,0),HY(31,0)
+-+  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+-+  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
+-+  add r0,r8,32
+-   vsth VX(48,32++),(r0+=r6) REP 16
+--  b lr
+-+  pop pc
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index d720546..12ad5fb 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -89,7 +89,7 @@ struct GPU
+- {
+-   unsigned int qpu_code[QPU_CODE_SIZE];
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+--  short transMatrix2even[16*16];
+-+  short transMatrix2even[16*16*2];
+-   int open_count; // Number of allocated video buffers
+-   unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-@@ -162,7 +162,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-   // And the transform coefficients
+--  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+- 
+-   return 0;
+- }
+--- 
+-2.7.4
+-
+-
+-From 6c2ed6109c4dd5c8ab16bf16e0ae3be6ae166e50 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 5 May 2015 16:57:03 +0100
+-Subject: [PATCH 07/68] Clear coefficients in advance
+-
+----
+- libavcodec/hevc.c               | 129 ++++++++++++++++++++++++++++------------
+- libavcodec/hevc.h               |   6 +-
+- libavcodec/hevc_cabac.c         |   7 ++-
+- libavcodec/rpi_hevc_transform.h |  50 ++++++++++++++++
+- libavcodec/rpi_hevc_transform.s |  16 +++++
+- 5 files changed, 168 insertions(+), 40 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 0dde6f2..1424007 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,6 +43,8 @@
+- 
+- #ifdef RPI
+- #include "rpi_qpu.h"
+-+// For some unknown reason, the code seems to crash if I do a late malloc
+-+#define EARLY_MALLOC
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -61,6 +63,20 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- /* free everything allocated  by pic_arrays_init() */
+- static void pic_arrays_free(HEVCContext *s)
+- {
+-+#ifdef RPI
+-+#ifdef EARLY_MALLOC
+-+#else
+-+    printf("pic_arrays_free\n");
+-+    if (s->coeffs_buf_arm[0]) {
+-+      gpu_free(&s->coeffs_buf_default);
+-+      s->coeffs_buf_arm[0] = 0;
+-+    }
+-+    if (s->coeffs_buf_arm[2]) {
+-+      gpu_free(&s->coeffs_buf_accelerated);
+-+      s->coeffs_buf_arm[2] = 0;
+-+    }
+-+#endif
+-+#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -97,6 +113,28 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int ctb_count        = sps->ctb_width * sps->ctb_height;
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+-+#ifdef RPI
+-+#ifdef EARLY_MALLOC
+-+#else
+-+    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
+-+    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    printf("pic_arrays_init\n");
+-+    printf("Allocated %d\n",coefs_per_row);
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+    if (!s->coeffs_buf_arm[0])
+-+        goto fail;
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+    if (!s->coeffs_buf_arm[2])
+-+        goto fail;
+-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+    printf("Done\n");
+-+#endif
+-+#endif
+-+
+-     s->bs_width  = (width  >> 2) + 1;
+-     s->bs_height = (height >> 2) + 1;
+- 
+-@@ -2400,11 +2438,10 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf[2]);
+--    gpu_cache_flush(&s->coeffs_buf[3]);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
+--    gpu_cache_flush(&s->coeffs_buf[2]);
+--    gpu_cache_flush(&s->coeffs_buf[3]);
+-+
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-+    //gpu_cache_flush(&s->coeffs_buf_accelerated);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2426,7 +2463,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-       } else {
+-+          int trafo_size = 1 << cmd->size;
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+-@@ -3235,10 +3274,18 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_mv_cmds);
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+--    for(i = 0; i < 4; i++) {
+--        gpu_free(&s->coeffs_buf[i]);
+-+
+-+#ifdef EARLY_MALLOC
+-+    if (s->coeffs_buf_arm[0]) {
+-+      gpu_free(&s->coeffs_buf_default);
+-+      s->coeffs_buf_arm[0] = 0;
+-+    }
+-+    if (s->coeffs_buf_arm[2]) {
+-+      gpu_free(&s->coeffs_buf_accelerated);
+-+      s->coeffs_buf_arm[2] = 0;
+-     }
+- #endif
+-+#endif
+- 
+-     for (i = 0; i < 3; i++) {
+-         av_freep(&s->sao_pixel_buffer_h[i]);
+-@@ -3281,6 +3328,16 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     return 0;
+- }
+- 
+-+#ifdef RPI
+-+static av_cold void memclear16(int16_t *p, int n)
+-+{
+-+  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+-+  //int i;
+-+  //for(i=0;i<n;i++)
+-+  //  p[i] = 0;
+-+}
+-+#endif
+-+
+- static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-     HEVCContext *s = avctx->priv_data;
+-@@ -3304,37 +3361,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+--    for(i = 0; i < 4; i++) {
+--        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
+--        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
+--        if (!s->coeffs_buf_arm[i])
+--            goto fail;
+--    }
+--    s->enable_rpi = 0;
+- 
+--    // A little test program
+--    /*{
+--      GPU_MEM_PTR_T p;
+--      int err = gpu_malloc_cached(16, &p);
+--      short *q = (short *)p.arm;
+--      int i;
+--      int r;
+--      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
+--      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
+--      printf("Preparing data %p\n",q);
+--      for(i=0;i<16;i++)
+--        q[i] = i;
+--      printf("Flush cache\n");
+--      gpu_cache_flush(&p);
+--      printf("Executing code\n");
+--      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
+--      printf("Return value %d (",r);
+--      for(i=0;i<16;i++)
+--        printf("%d ",q[i]);
+--      printf(")\n");
+--      gpu_free(&p);
+--      goto fail; // Early out
+--    }*/
+-+    s->coeffs_buf_arm[0] = 0;
+-+    s->coeffs_buf_arm[2] = 0;
+-+
+-+#ifdef EARLY_MALLOC
+-+    int coeffs_in_ctb = 64*64;
+-+    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    printf("Allocated %d\n",coefs_per_row);
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+    if (!s->coeffs_buf_arm[0])
+-+        goto fail;
+-+    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+    if (!s->coeffs_buf_arm[2])
+-+        goto fail;
+-+    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+    printf("Done\n");
+-+    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-+    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+-+    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+-+    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+-+#endif
+-+
+-+    s->enable_rpi = 0;
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 4167985..9a228f6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -861,8 +861,12 @@ typedef struct HEVCContext {
+-     HEVCMvCmd *unif_mv_cmds;
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+--    GPU_MEM_PTR_T coeffs_buf[4];
+-+    int buf_width;
+-+    GPU_MEM_PTR_T coeffs_buf_default;
+-+    GPU_MEM_PTR_T coeffs_buf_accelerated;
+-     int16_t *coeffs_buf_arm[4];
+-+    unsigned int coeffs_buf_vc[4];
+-+
+-     int num_coeffs[4];
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 88aa959..dbfee85 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1058,9 +1058,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-             s->num_coeffs[0] += n;
+-         }
+-     }
+-+    // We now do the memset after transform_add while we know the data is cached.
+-+    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+#else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- #endif
+- 
+--    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+
+- 
+-     // Derive QP for dequant
+-     if (!lc->cu.cu_transquant_bypass_flag) {
+-@@ -1547,7 +1551,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+- #ifdef RPI
+-     if (s->enable_rpi) {
+-         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+--        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+-         cmd->buf = coeffs;
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 6d772d7..4f13622 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -1,4 +1,10 @@
+- unsigned char rpi_hevc_transform [] = {
+-+21,
+-+106,
+-+0,
+ +144,
+-+35,
+-+1,
+- 169,
+- 3,
+- 62,
+-@@ -577,4 +583,48 @@ unsigned char rpi_hevc_transform [] = {
+- 0,
+- 111,
+- 3,
+-+4,
+-+254,
+++161,
+ +0,
+-+128,
+++188,
+++64,
+++67,
+++232,
+ +0,
+-+4,
+++2,
+ +0,
+-+248,
+ +0,
+ +0,
+-+2,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++150,
+++0,
+++195,
+ +232,
+-+32,
+ +0,
+++2,
+ +0,
+ +0,
+-+140,
+++12,
+++128,
+++7,
+++192,
+++130,
+ +248,
+-+32,
+-+0,
+ +0,
+ +0,
+++112,
+++192,
+ +224,
+-+35,
+++16,
+++195,
+++31,
+++132,
+++248,
+++1,
+ +0,
+++112,
+ +0,
+-+64,
+++224,
+++16,
+++203,
+++31,
+++3,
+++99,
+++131,
+++71,
+++68,
+ +232,
+++32,
+ +0,
+-+2,
+ +0,
+ +0,
+-+193,
+-+232,
+ +0,
+-+1,
+++99,
+++2,
+++99,
+++23,
+++102,
+++7,
+++106,
+++127,
+++156,
+++182,
+++255,
+ +0,
+++248,
+++64,
+ +0,
+-+1,
+-+106,
+-+116,
+-+30,
+-+90,
+++112,
+ +0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index afdb32a..fd159bc 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -78,8 +78,11 @@
+- # num: number of 16x16 transforms to be done
+- # coeffs32
+- # num32: number of 32x32 transforms
+-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
+- #
+- hevc_trans_16x16:
+-+  cmp r5,1
+-+  beq memclear16
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -266,3 +269,16 @@ trans32:
+-   add r0,r8,32
+-   vsth VX(48,32++),(r0+=r6) REP 16
+-   pop pc
+-+
+-+memclear16:
+-+  # r0 is address
+-+  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
+-+  vmov HX(0++,0),0 REP 16
+-+  mov r2,32
+-+loop:
+-+  vsth HX(0++,0),(r0+=r2) REP 16
+-+  add r0,16*16*2
+-+  sub r1,16*16
+-+  cmp r1,0
+-+  bgt loop
+-+  b lr
+--- 
+-2.7.4
+-
+-
+-From 48282c2fb55c0d9a72222f384c03c432f78a3016 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 09:56:43 +0100
+-Subject: [PATCH 08/68] Prepared inter offload
+-
+----
+- libavcodec/hevc.c       | 116 +++++++++++++++++++++++++++++++++++++++++++-----
+- libavcodec/hevc.h       |  29 +++++++++++-
+- libavcodec/hevc_cabac.c |   5 ++-
+- 3 files changed, 137 insertions(+), 13 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 1424007..8215201 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -45,6 +45,8 @@
+- #include "rpi_qpu.h"
+- // For some unknown reason, the code seems to crash if I do a late malloc
+- #define EARLY_MALLOC
+-+// Move Inter prediction into separate pass
+-+//#define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -1440,6 +1442,95 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+-  * @param luma_offset additive offset applied to the luma prediction value
+-  */
+- 
+-+#ifdef RPI_INTER
+-+#define RPI_REDIRECT(fn) rpi_ ## fn
+-+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-+                        int block_w, int block_h, int luma_weight, int luma_offset)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_LUMA_UNI;
+-+    cmd->dst = dst;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref->data[0];
+-+    cmd->srcstride = ref->linesize[0];
+-+    cmd->mv = *mv;
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = luma_weight;
+-+    cmd->offset = luma_offset;
+-+}
+-+
+-+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_LUMA_BI;
+-+    cmd->dst = dst;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref->data[0];
+-+    cmd->srcstride = ref->linesize[0];
+-+    cmd->mv = *mv;
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = luma_weight;
+-+    cmd->offset = luma_offset;
+-+    cmd->src1 = ref1->data[];
+-+    cmd->srcstride1 = ref1->linesize[0];
+-+    cmd->mv1 = *mv1;
+-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
+-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
+-+}
+-+
+-+static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_CHROMA_UNI;
+-+    cmd->dst = dst0;
+-+    cmd->dststride = dststride;
+-+    cmd->src = src0;
+-+    cmd->srcstride = srcstride;
+-+    cmd->mv = current_mv->mv[reflist];
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = chroma_weight;
+-+    cmd->offset = chroma_offset;
+-+}
+-+
+-+static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+-+{
+-+    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-+    cmd->dst = dst0;
+-+    cmd->dststride = dststride;
+-+    cmd->src = ref0->data[cidx+1];
+-+    cmd->srcstride = ref0->linesize[cidx+1];
+-+    cmd->mv = current_mv->mv[reflist];
+-+    cmd->x_off = x_off;
+-+    cmd->y_off = y_off;
+-+    cmd->block_w = block_w;
+-+    cmd->block_h = block_h;
+-+    cmd->weight = chroma_weight;
+-+    cmd->offset = chroma_offset;
+-+    cmd->src = ref1->data[cidx+1];
+-+    cmd->srcstride1 = ref1->linesize[cidx+1];
+-+    cmd->ref_idx[0] = current_mv->ref_idx[0];
+-+    cmd->ref_idx[1] = current_mv->ref_idx[1];
+-+}
+-+#else
+-+#define RPI_REDIRECT(fn) fn
+-+#endif
+-+
+- static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+-@@ -1505,7 +1596,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-  * @param mv1 motion vector1 (relative to block position) to get pixel data from
+-  * @param current_mv current motion vector structure
+-  */
+-- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+-@@ -1887,16 +1978,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+-         }
+-@@ -1906,17 +1997,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+-+        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+- 
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+-         }
+-@@ -1926,15 +2017,15 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+-+        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+- 
+--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
+-         }
+-     }
+-@@ -2465,7 +2556,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-       } else {
+-           int trafo_size = 1 << cmd->size;
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+-+#ifdef RPI_PRECLEAR
+-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-+#endif
+-       }
+-   }
+-   s->num_pred_cmds = 0;
+-@@ -3381,6 +3474,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-     s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-     printf("Done\n");
+-+#ifdef RPI_PRECLEAR
+-     //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-     memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-     //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-@@ -3389,6 +3483,8 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+- #endif
+- 
+-+#endif
+-+
+-     s->enable_rpi = 0;
+- 
+- #endif
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 9a228f6..1ac119a 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -803,14 +803,39 @@ typedef struct HEVCLocalContext {
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+- #define RPI_MAX_WIDTH 2048
+- 
+--// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
+--#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
+-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+- 
+-+#define RPI_CMD_LUMA_UNI 0
+-+#define RPI_CMD_CHROMA_UNI 1
+-+#define RPI_CMD_LUMA_BI 2
+-+#define RPI_CMD_U_BI 3
+-+#define RPI_CMD_V_BI 4
+-+
+-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+-+// #define RPI_PRECLEAR
+-+
+- // Command for inter prediction
+- typedef struct HEVCMvCmd {
+-+    int cmd;
+-+    uint8_t *dst;
+-+    ptrdiff_t dststride;
+-+    uint8_t *src;
+-+    ptrdiff_t srcstride;
+-+    Mv mv;
+-+    int x_off;
+-+    int y_off;
+-+    int block_w;
+-+    int block_h;
+-+    int weight;
+-+    int offset;
+-+    uint8_t *src1;
+-+    ptrdiff_t srcstride1;
+-+    Mv mv1;
+-+    int8_t ref_idx[2];
+- } HEVCMvCmd;
+- 
+- // Command for transform to process a block of coefficients
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index dbfee85..4f072be 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1059,7 +1059,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-         }
+-     }
+-     // We now do the memset after transform_add while we know the data is cached.
+--    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #ifdef RPI_PRECLEAR
+-+    #else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #endif
+- #else
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 25d3b4e876febe08302a01abd85d5009160ead3e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 11:08:50 +0100
+-Subject: [PATCH 09/68] Inter prediction in separate pass
+-
+----
+- libavcodec/hevc.c | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
+- libavcodec/hevc.h |  2 +-
+- 2 files changed, 77 insertions(+), 18 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 8215201..b7bc6ad 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -46,7 +46,7 @@
+- // For some unknown reason, the code seems to crash if I do a late malloc
+- #define EARLY_MALLOC
+- // Move Inter prediction into separate pass
+--//#define RPI_INTER
+-+#define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -1448,7 +1448,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_LUMA_UNI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1467,31 +1467,29 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_LUMA_BI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+--    cmd->src = ref->data[0];
+--    cmd->srcstride = ref->linesize[0];
+--    cmd->mv = *mv;
+-+    cmd->src = ref0->data[0];
+-+    cmd->srcstride = ref0->linesize[0];
+-+    cmd->mv = *mv0;
+-     cmd->x_off = x_off;
+-     cmd->y_off = y_off;
+-     cmd->block_w = block_w;
+-     cmd->block_h = block_h;
+--    cmd->weight = luma_weight;
+--    cmd->offset = luma_offset;
+--    cmd->src1 = ref1->data[];
+-+    cmd->src1 = ref1->data[0];
+-     cmd->srcstride1 = ref1->linesize[0];
+-     cmd->mv1 = *mv1;
+-     cmd->ref_idx[0] = current_mv->ref_idx[0];
+-     cmd->ref_idx[1] = current_mv->ref_idx[1];
+- }
+- 
+--static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_CHROMA_UNI;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -1506,27 +1504,27 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-     cmd->offset = chroma_offset;
+- }
+- 
+--static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+- {
+--    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-     cmd->src = ref0->data[cidx+1];
+-     cmd->srcstride = ref0->linesize[cidx+1];
+--    cmd->mv = current_mv->mv[reflist];
+-+    cmd->mv = current_mv->mv[0];
+-+    cmd->mv1 = current_mv->mv[1];
+-     cmd->x_off = x_off;
+-     cmd->y_off = y_off;
+-     cmd->block_w = block_w;
+-     cmd->block_h = block_h;
+--    cmd->weight = chroma_weight;
+--    cmd->offset = chroma_offset;
+--    cmd->src = ref1->data[cidx+1];
+-+    cmd->src1 = ref1->data[cidx+1];
+-     cmd->srcstride1 = ref1->linesize[cidx+1];
+-     cmd->ref_idx[0] = current_mv->ref_idx[0];
+-     cmd->ref_idx[1] = current_mv->ref_idx[1];
+- }
+-+
+- #else
+- #define RPI_REDIRECT(fn) fn
+- #endif
+-@@ -2554,7 +2552,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-       } else {
+-+#ifdef RPI_PRECLEAR
+-           int trafo_size = 1 << cmd->size;
+-+#endif
+-           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+- #ifdef RPI_PRECLEAR
+-           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+-@@ -2563,6 +2563,61 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+-   }
+-   s->num_pred_cmds = 0;
+- }
+-+
+-+static void rpi_execute_inter_cmds(HEVCContext *s)
+-+{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds;
+-+    int n,cidx;
+-+    AVFrame myref;
+-+    AVFrame myref1;
+-+    struct MvField mymv;
+-+    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
+-+        printf("Overflow inter_cmds\n");
+-+        exit(-1);
+-+    }
+-+    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
+-+        switch(cmd->cmd) {
+-+        case RPI_CMD_LUMA_UNI:
+-+            myref.data[0] = cmd->src;
+-+            myref.linesize[0] = cmd->srcstride;
+-+            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+-+            break;
+-+        case RPI_CMD_LUMA_BI:
+-+            myref.data[0] = cmd->src;
+-+            myref.linesize[0] = cmd->srcstride;
+-+            myref1.data[0] = cmd->src1;
+-+            myref1.linesize[0] = cmd->srcstride1;
+-+            mymv.ref_idx[0] = cmd->ref_idx[0];
+-+            mymv.ref_idx[1] = cmd->ref_idx[1];
+-+            luma_mc_bi(s, cmd->dst, cmd->dststride,
+-+                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
+-+                       &myref1, &cmd->mv1, &mymv);
+-+            break;
+-+        case RPI_CMD_CHROMA_UNI:
+-+            mymv.mv[0] = cmd->mv;
+-+            chroma_mc_uni(s, cmd->dst,
+-+                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+-+                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
+-+            break;
+-+        case RPI_CMD_CHROMA_BI:
+-+        case RPI_CMD_CHROMA_BI+1:
+-+            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
+-+            myref.data[cidx+1] = cmd->src;
+-+            myref.linesize[cidx+1] = cmd->srcstride;
+-+            myref1.data[cidx+1] = cmd->src1;
+-+            myref1.linesize[cidx+1] = cmd->srcstride1;
+-+            mymv.ref_idx[0] = cmd->ref_idx[0];
+-+            mymv.ref_idx[1] = cmd->ref_idx[1];
+-+            mymv.mv[0] = cmd->mv;
+-+            mymv.mv[1] = cmd->mv1;
+-+            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
+-+                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
+-+            break;
+-+        }
+-+    }
+-+    s->num_mv_cmds = 0;
+-+}
+-+
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -2611,6 +2666,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-+            // Perform inter prediction
+-+            rpi_execute_inter_cmds(s);
+-             // Transform all blocks
+-             rpi_execute_transform(s);
+-             // Perform intra prediction and residual reconstruction
+-@@ -3422,6 +3479,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- }
+- 
+- #ifdef RPI
+-+#ifdef RPI_PRECLEAR
+- static av_cold void memclear16(int16_t *p, int n)
+- {
+-   vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+-@@ -3430,6 +3488,7 @@ static av_cold void memclear16(int16_t *p, int n)
+-   //  p[i] = 0;
+- }
+- #endif
+-+#endif
+- 
+- static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 1ac119a..a0eb71b 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -812,7 +812,7 @@ typedef struct HEVCLocalContext {
+- #define RPI_CMD_LUMA_UNI 0
+- #define RPI_CMD_CHROMA_UNI 1
+- #define RPI_CMD_LUMA_BI 2
+--#define RPI_CMD_U_BI 3
+-+#define RPI_CMD_CHROMA_BI 3
+- #define RPI_CMD_V_BI 4
+- 
+- // RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
+--- 
+-2.7.4
+-
+-
+-From 8af0a0a036e4bb3883f144d0567bc527772dd65b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 13:03:50 +0100
+-Subject: [PATCH 10/68] Added VPU thread
+-
+----
+- libavcodec/hevc.c    |  11 +++--
+- libavcodec/hevc.h    |   1 +
+- libavcodec/rpi_qpu.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.h |   2 +
+- 4 files changed, 133 insertions(+), 6 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b7bc6ad..98dbd69 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2529,8 +2529,10 @@ static void rpi_execute_transform(HEVCContext *s)
+- 
+- 
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+--    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-+    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    //vpu_wait(s->vpu_id);
+- 
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2666,10 +2668,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+--            // Perform inter prediction
+--            rpi_execute_inter_cmds(s);
+-             // Transform all blocks
+-             rpi_execute_transform(s);
+-+            // Perform inter prediction
+-+            rpi_execute_inter_cmds(s);
+-+            // Wait for transform completion
+-+            vpu_wait(s->vpu_id);
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+-@@ -3426,6 +3430,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->univ_pred_cmds);
+- 
+- #ifdef EARLY_MALLOC
+-+    printf("hevc_decode_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-       gpu_free(&s->coeffs_buf_default);
+-       s->coeffs_buf_arm[0] = 0;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index a0eb71b..0d8dfe9 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -896,6 +896,7 @@ typedef struct HEVCContext {
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-     int num_pred_cmds;
+-+    int vpu_id;
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 12ad5fb..378dd74 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,9 +1,13 @@
+- #ifdef RPI
+--// Use the vcsm device for shared memory
+-+// define RPI_USE_VCSM to use the vcsm device for shared memory
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+--#define RPI_TIME_TOTAL_QPU
+--#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+-+//#define RPI_TIME_TOTAL_QPU
+-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-+//#define RPI_TIME_TOTAL_VPU
+-+// define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+-+#define RPI_ASYNC
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -113,6 +117,19 @@ static unsigned int Microseconds(void) {
+- }
+- #endif
+- 
+-+#ifdef RPI_ASYNC
+-+pthread_t vpu_thread;
+-+static void *vpu_start(void *arg);
+-+
+-+#define MAXCMDS 128
+-+static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
+-+static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+
+-+static int vpu_cmds[MAXCMDS][8];
+-+static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+-+static volatile int vpu_async_head=0;
+-+#endif
+-+
+- // Connect to QPU, returns 0 on success.
+- static int gpu_init(volatile struct GPU **gpu) {
+-   int mb = mbox_open();
+-@@ -164,12 +181,27 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   // And the transform coefficients
+-   memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+- 
+-+#ifdef RPI_ASYNC
+-+  {
+-+    int err;
+-+    vpu_async_tail = 0;
+-+    vpu_async_head = 0;
+-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-+    //printf("Created thread\n");
+-+    if (err) {
+-+        printf("Failed to create vpu thread\n");
+-+        return -4;
+-+    }
+-+  }
+-+#endif
+-+
+-   return 0;
+- }
+- 
+- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+- static void gpu_lock(void) {
+-   pthread_mutex_lock(&gpu_mutex);
+-+
+-   if (gpu==NULL) {
+-     gpu_init(&gpu);
+-   }
+-@@ -264,6 +296,16 @@ static void gpu_term(void)
+- 	unsigned handle = gpu->vc_handle;
+-   if (gpu==NULL)
+-     return;
+-+
+-+#ifdef RPI_ASYNC
+-+  {
+-+    void *res;
+-+    vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
+-+    pthread_join(vpu_thread, &res);
+-+  }
+-+#endif
+-+
+-+
+- 	unmapmem((void*)gpu, sizeof(struct GPU));
+- 	mem_unlock(mb, handle);
+- 	mem_free(mb, handle);
+-@@ -322,6 +364,79 @@ unsigned int vpu_get_constants(void) {
+-   return gpu->vc + offsetof(struct GPU,transMatrix2even);
+- }
+- 
+-+#ifdef RPI_ASYNC
+-+
+-+static void *vpu_start(void *arg) {
+-+  while(1) {
+-+    pthread_mutex_lock(&post_mutex);
+-+    while( vpu_async_tail - vpu_async_head <= 0)
+-+    {
+-+      //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+-+      pthread_cond_wait(&post_cond, &post_mutex);
+-+    }
+-+    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    pthread_mutex_unlock(&post_mutex);
+-+
+-+    if (p[6] == -1) {
+-+      break; // Last job
+-+    }
+-+    if (p[7]) {
+-+        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-+        //gpu_cache_flush(buf);
+-+    }
+-+    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+
+-+    pthread_mutex_lock(&post_mutex);
+-+    vpu_async_head++;
+-+    pthread_cond_broadcast(&post_cond);
+-+    pthread_mutex_unlock(&post_mutex);
+-+  }
+-+
+-+  return NULL;
+-+}
+-+
+-+// Post a command to the queue
+-+// Returns an id which we can use to wait for completion
+-+int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+-+{
+-+  pthread_mutex_lock(&post_mutex);
+-+  {
+-+    int id = vpu_async_tail++;
+-+    int *p = vpu_cmds[id%MAXCMDS];
+-+    int num = vpu_async_tail - vpu_async_head;
+-+    if (num>MAXCMDS) {
+-+      printf("Too many commands submitted\n");
+-+      exit(-1);
+-+    }
+-+    p[0] = code;
+-+    p[1] = r0;
+-+    p[2] = r1;
+-+    p[3] = r2;
+-+    p[4] = r3;
+-+    p[5] = r4;
+-+    p[6] = r5;
+-+    p[7] = (int) buf;
+-+    if (num<=1)
+-+      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
+-+    pthread_mutex_unlock(&post_mutex);
+-+    return id;
+-+  }
+-+}
+-+
+-+// Wait for completion of the given command
+-+void vpu_wait(int id)
+-+{
+-+  pthread_mutex_lock(&post_mutex);
+-+  while( id + 1 - vpu_async_head > 0)
+-+  {
+-+    pthread_cond_wait(&post_cond, &post_mutex);
+-+  }
+-+  pthread_mutex_unlock(&post_mutex);
+-+}
+-+
+-+#endif
+-+
+-+
+- unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+- {
+-   unsigned r;
+-@@ -334,7 +449,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
+-   static int count=0;
+-   static long long countr2=0;
+- #endif
+-+#ifndef RPI_ASYNC
+-   gpu_lock();
+-+#endif
+- #ifdef RPI_TIME_TOTAL_VPU
+-   start_time = Microseconds();
+-   if (last_time==0)
+-@@ -351,7 +468,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
+-   if ((count&0x7f)==0)
+-     printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
+- #endif
+-+#ifndef RPI_ASYNC
+-   gpu_unlock();
+-+#endif
+-   return r;
+- }
+- 
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 814fc3c..3526fce 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -36,6 +36,8 @@ extern unsigned int qpu_get_fn(int num);
+- extern unsigned int vpu_get_fn(void);
+- extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+- extern int rpi_test_shader(void);
+--- 
+-2.7.4
+-
+-
+-From 016d3db644e60fbe272bfcf1d7c3670c82422317 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 6 May 2015 15:03:37 +0100
+-Subject: [PATCH 11/68] Added different signal when tail moves
+-
+----
+- libavcodec/rpi_qpu.c | 11 ++++++-----
+- 1 file changed, 6 insertions(+), 5 deletions(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 378dd74..d1c3e20 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -122,7 +122,8 @@ pthread_t vpu_thread;
+- static void *vpu_start(void *arg);
+- 
+- #define MAXCMDS 128
+--static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
+-+static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+-+static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+- static int vpu_cmds[MAXCMDS][8];
+-@@ -372,7 +373,7 @@ static void *vpu_start(void *arg) {
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+--      pthread_cond_wait(&post_cond, &post_mutex);
+-+      pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+-     int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -388,7 +389,7 @@ static void *vpu_start(void *arg) {
+- 
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+--    pthread_cond_broadcast(&post_cond);
+-+    pthread_cond_broadcast(&post_cond_head);
+-     pthread_mutex_unlock(&post_mutex);
+-   }
+- 
+-@@ -417,7 +418,7 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+-     p[6] = r5;
+-     p[7] = (int) buf;
+-     if (num<=1)
+--      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
+-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-     return id;
+-   }
+-@@ -429,7 +430,7 @@ void vpu_wait(int id)
+-   pthread_mutex_lock(&post_mutex);
+-   while( id + 1 - vpu_async_head > 0)
+-   {
+--    pthread_cond_wait(&post_cond, &post_mutex);
+-+    pthread_cond_wait(&post_cond_head, &post_mutex);
+-   }
+-   pthread_mutex_unlock(&post_mutex);
+- }
+--- 
+-2.7.4
+-
+-
+-From b04a72641253dc89fd1ec688035c3e2a946aa370 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 08:57:11 +0100
+-Subject: [PATCH 12/68] Add option to test for gpu_idle
+-
+----
+- libavcodec/hevc.c    |  3 ++-
+- libavcodec/rpi_qpu.c | 18 ++++++++++++++++++
+- 2 files changed, 20 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 98dbd69..2e269b6 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2527,7 +2527,6 @@ static void rpi_execute_transform(HEVCContext *s)
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-@@ -2669,6 +2668,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-             // Transform all blocks
+-+            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index d1c3e20..85f49db 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -199,6 +199,17 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   return 0;
+- }
+- 
+-+// Returns 1 if the gpu is currently idle
+-+static int gpu_idle(void)
+-+{
+-+  int ret = pthread_mutex_trylock(&gpu_mutex);
+-+  if (ret==0) {
+-+    pthread_mutex_unlock(&gpu_mutex);
+-+    return 1;
+-+  }
+-+  return 0;
+-+}
+-+
+- // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+- static void gpu_lock(void) {
+-   pthread_mutex_lock(&gpu_mutex);
+-@@ -400,6 +411,13 @@ static void *vpu_start(void *arg) {
+- // Returns an id which we can use to wait for completion
+- int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+- {
+-+  // If the gpu is idle then just run the command immediately
+-+  // This works, but doesn't seem to give any benefit
+-+  // if (gpu_idle()) {
+-+  //   vpu_execute_code( code,  r0,  r1,  r2,  r3,  r4,  r5);
+-+  //   return -1; // TODO perhaps a wraparound bug here?
+-+  // }
+-+
+-   pthread_mutex_lock(&post_mutex);
+-   {
+-     int id = vpu_async_tail++;
+--- 
+-2.7.4
+-
+-
+-From e7b457e683d4ca92bf2677b69708fbfc3849847b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 11:01:35 +0100
+-Subject: [PATCH 13/68] Added deblocking pass
+-
+----
+- libavcodec/hevc.c        | 33 +++++++++++++++++++++++++++------
+- libavcodec/hevc.h        |  7 ++++++-
+- libavcodec/hevc_filter.c |  6 +++++-
+- libavcodec/rpi_qpu.c     |  2 +-
+- 4 files changed, 39 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2e269b6..29f8415 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2518,6 +2518,17 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- }
+- 
+- #ifdef RPI
+-+static void rpi_execute_dblk_cmds(HEVCContext *s)
+-+{
+-+    int n;
+-+    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+-+    int (*p)[2] = s->dblk_cmds;
+-+    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
+-+        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+-+    }
+-+    s->num_dblk_cmds = 0;
+-+}
+-+
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-@@ -2631,7 +2642,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+--    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
+-     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+- #endif
+- 
+-@@ -2665,7 +2675,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- #ifdef RPI
+--        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
+-+        if (s->enable_rpi) {
+-+          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-+          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-+          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-             int x;
+-             // Transform all blocks
+-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-@@ -2678,10 +2691,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+--            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
+--                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
+--            }
+--            start_ctb_x = 0;
+-+            rpi_execute_dblk_cmds(s);
+-+          }
+-         }
+- #endif
+-         if (more_data < 0) {
+-@@ -2699,6 +2710,16 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+-     }
+- 
+-+#ifdef RPI
+-+    if (s->enable_rpi && s->num_dblk_cmds) {
+-+        rpi_execute_transform(s);
+-+        rpi_execute_inter_cmds(s);
+-+        vpu_wait(s->vpu_id);
+-+        rpi_execute_pred_cmds(s);
+-+        rpi_execute_dblk_cmds(s);
+-+    }
+-+#endif
+-+
+-     if (x_ctb + ctb_size >= s->ps.sps->width &&
+-         y_ctb + ctb_size >= s->ps.sps->height)
+-         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 0d8dfe9..990bd8c 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -808,6 +808,8 @@ typedef struct HEVCLocalContext {
+- #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+-+// Worst case is 16x16 CTUs
+-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
+- 
+- #define RPI_CMD_LUMA_UNI 0
+- #define RPI_CMD_CHROMA_UNI 1
+-@@ -867,6 +869,9 @@ typedef struct HEVCPredCmd {
+- #endif
+- 
+- typedef struct HEVCContext {
+-+#ifdef RPI
+-+    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
+-+#endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+- 
+-@@ -891,11 +896,11 @@ typedef struct HEVCContext {
+-     GPU_MEM_PTR_T coeffs_buf_accelerated;
+-     int16_t *coeffs_buf_arm[4];
+-     unsigned int coeffs_buf_vc[4];
+--
+-     int num_coeffs[4];
+-     int num_xfm_cmds;
+-     int num_mv_cmds;
+-     int num_pred_cmds;
+-+    int num_dblk_cmds;
+-     int vpu_id;
+- #endif
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index e4c3da7..ea0af91 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -877,8 +877,12 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             if (s->threads_type & FF_THREAD_FRAME )
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-         }
+--    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+-+        int newh = y + ctb_size - 4;
+-+        //int currh = s->ref->tf.progress->data[0];
+-+        //if (((y + ctb_size)&63)==0)
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+    }
+- }
+- 
+- void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 85f49db..3b6dae7 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -105,7 +105,7 @@ struct GPU
+- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+- 
+--#ifdef RPI_TIME_TOTAL_QPU
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+- static unsigned int Microseconds(void) {
+-     struct timespec ts;
+-     unsigned int x;
+--- 
+-2.7.4
+-
+-
+-From 7a443df9115f21b4428de378bd146dcdba3dd42a Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 7 May 2015 16:47:47 +0100
+-Subject: [PATCH 14/68] Added option to disable deblocking for non-ref frames
+-
+----
+- libavcodec/hevc_filter.c | 10 ++++++++++
+- 1 file changed, 10 insertions(+)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index ea0af91..2cdd621 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -25,6 +25,8 @@
+- //#define DISABLE_SAO
+- //#define DISABLE_DEBLOCK
+- //#define DISABLE_STRENGTHS
+-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
+-+//#define DISABLE_DEBLOCK_NONREF
+- 
+- #include "libavutil/common.h"
+- #include "libavutil/internal.h"
+-@@ -504,6 +506,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                 s->ps.sps->pcm.loop_filter_disable_flag) ||
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+-+#ifdef DISABLE_DEBLOCK_NONREF
+-+    if (    s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )
+-+      return; // Don't deblock non-reference frames
+-+#endif
+- #ifdef DISABLE_DEBLOCK
+-     return;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 9606e160a582db64ccf981d971cdc258d8cc02f7 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 11 May 2015 10:00:27 +0100
+-Subject: [PATCH 15/68] Moved buffers to VPU memory
+-
+----
+- libavcodec/hevc_filter.c | 17 +++++++++++++-
+- libavcodec/utils.c       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
+- libavutil/buffer.c       |  6 +++++
+- libavutil/buffer.h       |  3 +++
+- 4 files changed, 84 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 2cdd621..e1b32d4 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -866,6 +866,13 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+- #undef CB
+- #undef CR
+- 
+-+#ifdef RPI_INTER_QPU
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+-+#endif
+-+
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-@@ -888,9 +895,17 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-         }
+-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+--        int newh = y + ctb_size - 4;
+-+        //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+-+        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )) {
+-+            flush_buffer(s->frame->buf[1]);
+-+            flush_buffer(s->frame->buf[2]);
+-+        }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+- }
+-diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+-index f7adb52..708526e 100644
+---- a/libavcodec/utils.c
+-+++ b/libavcodec/utils.c
+-@@ -26,6 +26,12 @@
+-  */
+- 
+- #include "config.h"
+-+
+-+#ifdef RPI
+-+// Move video buffers to GPU memory
+-+#define RPI_GPU_BUFFERS
+-+#endif
+-+
+- #include "libavutil/atomic.h"
+- #include "libavutil/attributes.h"
+- #include "libavutil/avassert.h"
+-@@ -64,6 +70,10 @@
+- #include "libavutil/ffversion.h"
+- const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
+- 
+-+#ifdef RPI_GPU_BUFFERS
+-+#include "rpi_qpu.h"
+-+#endif
+-+
+- #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
+- static int default_lockmgr_cb(void **arg, enum AVLockOp op)
+- {
+-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+-     return ret;
+- }
+- 
+-+#ifdef RPI_GPU_BUFFERS
+-+static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+-+{
+-+    GPU_MEM_PTR_T *p = opaque;
+-+    gpu_free(p);
+-+    av_free(p);
+-+}
+-+
+-+static AVBufferRef *rpi_buffer_alloc(int size)
+-+{
+-+    AVBufferRef *ret = NULL;
+-+    uint8_t    *data = NULL;
+-+    GPU_MEM_PTR_T *p;
+-+
+-+    static int total=0;
+-+    total+=size;
+-+
+-+    p = av_malloc(sizeof *p);
+-+    if (!p)
+-+        return NULL;
+-+
+-+    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
+-+        return NULL;
+-+
+-+    data = p->arm;
+-+    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+-+    //memset(data, 64, size);
+-+
+-+    if (!data)
+-+        return NULL;
+-+
+-+    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+-+    if (!ret) {
+-+        gpu_free(p);
+-+        av_freep(&p);
+-+    }
+-+
+-+    return ret;
+-+}
+-+#endif
+-+
+- static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+- {
+-     FramePool *pool = avctx->internal->pool;
+-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+-             av_buffer_pool_uninit(&pool->pools[i]);
+-             pool->linesize[i] = linesize[i];
+-             if (size[i]) {
+-+#ifdef RPI_GPU_BUFFERS
+-+                if (avctx->codec_id == AV_CODEC_ID_HEVC)
+-+                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+-+                                                     CONFIG_MEMORY_POISONING ?
+-+                                                        NULL :
+-+                                                        rpi_buffer_alloc);
+-+                else
+-+#endif
+-                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+-                                                      CONFIG_MEMORY_POISONING ?
+-                                                         NULL :
+-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
+-index 694e116..203ca7b 100644
+---- a/libavutil/buffer.c
+-+++ b/libavutil/buffer.c
+-@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
+- 
+-     return ret;
+- }
+-+
+-+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+-+void *av_buffer_pool_opaque(AVBufferRef *ref) {
+-+  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+-+  return buf->opaque;
+-+}
+-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
+-index 0c0ce12..82e0bc3 100644
+---- a/libavutil/buffer.h
+-+++ b/libavutil/buffer.h
+-@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
+-  */
+- AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+- 
+-+// Return the opaque for the underlying frame
+-+void *av_buffer_pool_opaque(AVBufferRef *ref);
+-+
+- /**
+-  * @}
+-  */
+--- 
+-2.7.4
+-
+-
+-From f56515b9a720c829ba3ddf6da4232a91b13e0f03 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 11 May 2015 14:04:37 +0100
+-Subject: [PATCH 16/68] Prepared QPU execute code
+-
+----
+- libavcodec/hevc.c        | 227 ++++++++++++++++++++++++++++++++++++++++-------
+- libavcodec/hevc.h        |  22 ++++-
+- libavcodec/hevc_filter.c |   7 +-
+- libavcodec/rpi_qpu.c     |  55 +++++++++++-
+- libavcodec/rpi_qpu.h     |   2 +
+- 5 files changed, 276 insertions(+), 37 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 29f8415..66ed37a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -42,17 +42,45 @@
+- #include "profiles.h"
+- 
+- #ifdef RPI
+--#include "rpi_qpu.h"
+--// For some unknown reason, the code seems to crash if I do a late malloc
+--#define EARLY_MALLOC
+--// Move Inter prediction into separate pass
+--#define RPI_INTER
+-+  #include "rpi_qpu.h"
+-+  // For some unknown reason, the code seems to crash if I do a late malloc
+-+  #define EARLY_MALLOC
+-+  // Move Inter prediction into separate pass
+-+  #define RPI_INTER
+- #endif
+- 
+- // #define DISABLE_MC
+- 
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+-+
+-+#ifdef RPI_INTER_QPU
+-+
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+// The QPU code for UV blocks only works up to a block width of 8
+-+#define RPI_CHROMA_BLOCK_WIDTH 8
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
+-+
+-+// TODO Chroma only needs 4 taps
+-+static uint32_t rpi_filter_coefs[8][2] = {
+-+        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
+-+};
+-+
+-+static uint32_t get_vc_address(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+  return p->vc;
+-+}
+-+
+-+#endif
+-+
+- /**
+-  * NOTE: Each function hls_foo correspond to the function foo in the
+-  * specification (HLS stands for High Level Syntax).
+-@@ -66,6 +94,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- static void pic_arrays_free(HEVCContext *s)
+- {
+- #ifdef RPI
+-+
+- #ifdef EARLY_MALLOC
+- #else
+-     printf("pic_arrays_free\n");
+-@@ -1982,6 +2011,43 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int reflist = 0;
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[reflist];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+-@@ -2632,6 +2698,54 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- 
+- #endif
+- 
+-+#ifdef RPI_INTER_QPU
+-+static void rpi_inter_clear(HEVCContext *s)
+-+{
+-+    int i;
+-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = pic_width;
+-+        *s->u_mvs[i]++ = pic_height;
+-+        *s->u_mvs[i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+-+}
+-+
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+
+-+    if (s->sh.slice_type == I_SLICE)
+-+        return;
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+    }
+-+
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+-+}
+-+#endif
+-+
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -2658,6 +2772,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         }
+-     }
+- 
+-+#ifdef RPI_INTER_QPU
+-+    rpi_inter_clear(s);
+-+#endif
+-+
+-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+-         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+- 
+-@@ -2679,19 +2797,30 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--            int x;
+-+#ifdef RPI_INTER_QPU
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+#endif
+-             // Transform all blocks
+-             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+-+
+-+            // Copy back reconstructed data
+-+            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
+-+            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
+-+            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
+-+
+-             // Perform intra prediction and residual reconstruction
+-             rpi_execute_pred_cmds(s);
+-             // Perform deblocking for CTBs in this row
+-             rpi_execute_dblk_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+            rpi_inter_clear(s);
+-+#endif
+-           }
+-         }
+- #endif
+-@@ -2712,6 +2841,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- 
+- #ifdef RPI
+-     if (s->enable_rpi && s->num_dblk_cmds) {
+-+#ifdef RPI_INTER_QPU
+-+        rpi_execute_inter_qpu(s);
+-+#endif
+-         rpi_execute_transform(s);
+-         rpi_execute_inter_cmds(s);
+-         vpu_wait(s->vpu_id);
+-@@ -3451,6 +3583,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+- 
+-+#ifdef RPI_INTER_QPU
+-+    if (s->unif_mvs) {
+-+        gpu_free( &s->unif_mvs_ptr );
+-+        s->unif_mvs = 0;
+-+    }
+-+#endif
+-+    //gpu_free(&s->dummy);
+-+
+- #ifdef EARLY_MALLOC
+-     printf("hevc_decode_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-@@ -3541,34 +3681,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+- 
+--    s->coeffs_buf_arm[0] = 0;
+--    s->coeffs_buf_arm[2] = 0;
+-+#ifdef RPI_INTER_QPU
+-+    // We divide the image into blocks 256 wide and 64 high
+-+    // We support up to 2048 widths
+-+    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+-+    // Also add space for the startup command for each stream.
+-+
+-+    {
+-+        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+        uint32_t *p;
+-+        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->unif_mvs;
+-+        for(i = 0; i < 8; i++) {
+-+            s->mvs_base[i] = p;
+-+            p += uv_commands_per_qpu;
+-+        }
+-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+-+
+-+    }
+-+#endif
+-+    //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+- #ifdef EARLY_MALLOC
+--    int coeffs_in_ctb = 64*64;
+--    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+--    printf("Allocated %d\n",coefs_per_row);
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--    if (!s->coeffs_buf_arm[0])
+--        goto fail;
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--    if (!s->coeffs_buf_arm[2])
+--        goto fail;
+--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--    printf("Done\n");
+-+    {
+-+        int coeffs_in_ctb = 64*64;
+-+        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+-+        s->coeffs_buf_arm[0] = 0;
+-+        s->coeffs_buf_arm[2] = 0;
+-+        printf("Allocated %d\n",coefs_per_row);
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-+        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-+        if (!s->coeffs_buf_arm[0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+-+        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+-+        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+-+        if (!s->coeffs_buf_arm[2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+-+        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+-+        printf("Done\n");
+- #ifdef RPI_PRECLEAR
+--    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+--    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+--    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+--    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+-+        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+-+        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+- #endif
+--
+-+    }
+- #endif
+- 
+-     s->enable_rpi = 0;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 990bd8c..da345f6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -42,7 +42,11 @@
+- 
+- // define RPI to split the CABAC/prediction/transform into separate stages
+- #ifdef RPI
+--#include "rpi_qpu.h"
+-+
+-+  #include "rpi_qpu.h"
+-+  // Use QPU for inter prediction
+-+  //#define RPI_INTER_QPU
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -888,7 +892,7 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;
+-+    HEVCMvCmd *unif_mv_cmds;  // TODO rename
+-     HEVCXfmCmd *unif_xfm_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+-     int buf_width;
+-@@ -902,6 +906,20 @@ typedef struct HEVCContext {
+-     int num_pred_cmds;
+-     int num_dblk_cmds;
+-     int vpu_id;
+-+    //GPU_MEM_PTR_T dummy;
+-+#ifdef RPI_INTER_QPU
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+-+    // Function pointers
+-+    uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b;
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index e1b32d4..5b3d759 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -903,8 +903,11 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--            flush_buffer(s->frame->buf[1]);
+--            flush_buffer(s->frame->buf[2]);
+-+            //flush_buffer(s->frame->buf[1]);
+-+            //flush_buffer(s->frame->buf[2]);
+-+            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-         }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 3b6dae7..e4dd58a 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -3,7 +3,7 @@
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- #define RPI_USE_VCSM
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+--//#define RPI_TIME_TOTAL_QPU
+-+#define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+-@@ -30,7 +30,7 @@
+- #endif
+- 
+- // On Pi2 there is no way to access the VPU L2 cache
+--// GPU_MEM_FLG should be 4 for uncached memory.
+-+// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+- #define GPU_MEM_FLG 0xC
+-@@ -549,6 +549,54 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
+-   gpu_unlock();
+- }
+- 
+-+// Run a program on 8 QPUs with the given code and uniform stream (given in GPU addresses)
+-+void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+{
+-+  int i;
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  static int last_time=0;
+-+  static long long on_time=0;
+-+  static long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  static int count=0;
+-+#endif
+-+
+-+  gpu_lock();
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  start_time = Microseconds();
+-+  if (last_time==0)
+-+    last_time = start_time;
+-+  off_time += start_time-last_time;
+-+#endif
+-+  for(i=0;i<8;i++) {
+-+    gpu->mail[i*2 + 1] = code;
+-+  }
+-+  gpu->mail[0 ] = unifs1;
+-+  gpu->mail[2 ] = unifs2;
+-+  gpu->mail[4 ] = unifs3;
+-+  gpu->mail[6 ] = unifs4;
+-+  gpu->mail[8 ] = unifs5;
+-+  gpu->mail[10] = unifs6;
+-+	gpu->mail[12] = unifs7;
+-+	gpu->mail[14] = unifs8;
+-+	execute_qpu(
+-+		gpu->mb,
+-+		8 /* Number of QPUs */,
+-+		gpu->vc + offsetof(struct GPU, mail),
+-+		1 /* no flush */,  // Don't flush VPU L1 cache
+-+		5000 /* timeout ms */);
+-+#ifdef RPI_TIME_TOTAL_QPU
+-+  end_time = Microseconds();
+-+  last_time = end_time;
+-+  on_time += end_time - start_time;
+-+  count++;
+-+  if ((count&0x7f)==0)
+-+    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-+  gpu_unlock();
+-+}
+-+
+- unsigned int qpu_get_fn(int num) {
+-     // Make sure that the gpu is initialized
+-     unsigned int *fn;
+-@@ -585,6 +633,9 @@ unsigned int qpu_get_fn(int num) {
+-     case QPU_MC_FILTER_UV_B:
+-       fn = mc_filter_uv_b;
+-       break;
+-+    case QPU_MC_INTERRUPT_EXIT8:
+-+      fn = mc_interrupt_exit8;
+-+      break;
+-     case QPU_MC_END:
+-       fn = mc_end;
+-       break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 3526fce..2b22d98 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -16,6 +16,7 @@ extern void gpu_free(GPU_MEM_PTR_T *p);
+- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+- 
+- // QPU specific functions
+-+extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+- 
+- enum {
+-@@ -28,6 +29,7 @@ enum {
+-   QPU_MC_SETUP_UV,
+-   QPU_MC_FILTER_UV,
+-   QPU_MC_FILTER_UV_B,
+-+  QPU_MC_INTERRUPT_EXIT8,
+-   QPU_MC_END
+-   };
+- extern unsigned int qpu_get_fn(int num);
+--- 
+-2.7.4
+-
+-
+-From bd651e1569ebe0cdc41a6be169e139758cce069d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 11:47:23 +0100
+-Subject: [PATCH 17/68] Drafted chroma interpolation on QPUs
+-
+----
+- libavcodec/hevc.c          |   5 ++-
+- libavcodec/hevc.h          |   2 +-
+- libavcodec/hevc_filter.c   |   6 ++-
+- libavcodec/rpi_qpu.c       | 101 +++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.h       |   1 +
+- libavcodec/rpi_shader.c    |  42 +++++++++----------
+- libavcodec/rpi_shader.qasm |  42 +++++++++----------
+- 7 files changed, 149 insertions(+), 50 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 66ed37a..d5ea45e 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -60,11 +60,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+--#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+- static uint32_t rpi_filter_coefs[8][2] = {
+--        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-+        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+-         { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+-@@ -2729,6 +2729,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index da345f6..2497c47 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -45,7 +45,7 @@
+- 
+-   #include "rpi_qpu.h"
+-   // Use QPU for inter prediction
+--  //#define RPI_INTER_QPU
+-+  // #define RPI_INTER_QPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 5b3d759..9b6e26d 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -903,8 +903,10 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--            //flush_buffer(s->frame->buf[1]);
+--            //flush_buffer(s->frame->buf[2]);
+-+#ifdef RPI_INTER_QPU
+-+            flush_buffer(s->frame->buf[1]);
+-+            flush_buffer(s->frame->buf[2]);
+-+#endif
+-             //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-             //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-             //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index e4dd58a..4d9eda8 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -33,7 +33,8 @@
+- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+- // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+- // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+--#define GPU_MEM_FLG 0xC
+-+#define GPU_MEM_FLG 0x4
+-+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
+- #define GPU_MEM_MAP 0x0
+- 
+- #define vcos_verify(x) ((x)>=0)
+-@@ -165,6 +166,8 @@ static int gpu_init(volatile struct GPU **gpu) {
+- 	ptr->vc_handle = handle;
+- 	ptr->vc = vc;
+- 
+-+  printf("GPU allocated at 0x%x\n",vc);
+-+
+-   *gpu = ptr;
+- 
+-   // Now copy over the QPU code into GPU memory
+-@@ -304,10 +307,13 @@ int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+- 
+- static void gpu_term(void)
+- {
+--	int mb = gpu->mb;
+--	unsigned handle = gpu->vc_handle;
+-+	int mb;
+-+	unsigned handle;
+-+
+-   if (gpu==NULL)
+-     return;
+-+  mb = gpu->mb;
+-+  handle = gpu->vc_handle;
+- 
+- #ifdef RPI_ASYNC
+-   {
+-@@ -648,6 +654,95 @@ unsigned int qpu_get_fn(int num) {
+- }
+- 
+- #if 0
+-+typedef unsigned int uint32_t;
+-+
+-+typedef struct mvs_s {
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+-+
+-+} HEVCContext;
+-+
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+
+-+static void rpi_inter_clear(HEVCContext *s)
+-+{
+-+    int i;
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 128;  // w
+-+        *s->u_mvs[i]++ = 128;  // h
+-+        *s->u_mvs[i]++ = 128;  // stride u
+-+        *s->u_mvs[i]++ = 128;  // stride v
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+-+}
+-+
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+-+    }
+-+
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+-+}
+-+
+-+void rpi_test_qpu(void)
+-+{
+-+    HEVCContext mvs;
+-+    HEVCContext *s = &mvs;
+-+    int i;
+-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+    uint32_t *p;
+-+    printf("Allocate memory\n");
+-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+-+
+-+    // Set up initial locations for uniform streams
+-+    p = s->unif_mvs;
+-+    for(i = 0; i < 8; i++) {
+-+        s->mvs_base[i] = p;
+-+        p += uv_commands_per_qpu;
+-+    }
+-+    // Now run a simple program that should just quit immediately after a single texture fetch
+-+    rpi_inter_clear(s);
+-+    for(i=0;i<4;i++) {
+-+      printf("Launch QPUs\n");
+-+      rpi_execute_inter_qpu(s);
+-+      printf("Done\n");
+-+    }
+-+    printf("Free memory\n");
+-+    gpu_free(&s->unif_mvs_ptr);
+-+    return;
+-+}
+-+#endif
+-+
+-+#if 0
+- 
+- int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+- //int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 2b22d98..f9ad333 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -18,6 +18,7 @@ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+-+extern void rpi_test_qpu(void);
+- 
+- enum {
+-   QPU_MC_SETUP,
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 41cc2e1..d7ed297 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -152,23 +152,23 @@ unsigned int rpi_shader[] = {
+- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- /* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- /* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- /* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- /* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- /* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+- /* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+- /* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+- /* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-@@ -179,20 +179,20 @@ unsigned int rpi_shader[] = {
+- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+- /* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
+- /* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
+- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- /* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+- /* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+- /* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+- /* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 6851e83..02fdcb2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -270,23 +270,23 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+mov r2, rb21         ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+-+add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+-@@ -302,23 +302,23 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+-+asr r0, r0, 14          ; mov r1, ra21
+- min.setf ra15, r0, rb22
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add.ifnn r1, r1, r0     ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- brr.anyn -, r:uvloop
+--asr r1, r1, 15
+-+asr r1, r1, 14
+- min r1, r1, rb22
+- max vpm, r1, 0
+- 
+--- 
+-2.7.4
+-
+-
+-From 61628063461ee5d891af6dbedfd495efcf464012 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 13:54:11 +0100
+-Subject: [PATCH 18/68] Fixed chroma inter prediction
+-
+----
+- libavcodec/hevc.c          |    8 +-
+- libavcodec/hevc.h          |    2 +-
+- libavcodec/rpi_shader.c    | 1170 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   22 +-
+- libavcodec/rpi_shader.qasm |   24 +-
+- 5 files changed, 617 insertions(+), 609 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index d5ea45e..d6d78ee 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -57,9 +57,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- #ifdef RPI_INTER_QPU
+- 
+- #define RPI_CHROMA_COMMAND_WORDS 12
+-+#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+-+
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+-@@ -2024,7 +2026,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                int chan = x0>>8;
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2730,6 +2733,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-+        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-@@ -3689,7 +3693,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     // Also add space for the startup command for each stream.
+- 
+-     {
+--        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 2497c47..d513579 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -45,7 +45,7 @@
+- 
+-   #include "rpi_qpu.h"
+-   // Use QPU for inter prediction
+--  // #define RPI_INTER_QPU
+-+  #define RPI_INTER_QPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index d7ed297..831633b 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -33,7 +33,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+- /* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+- /* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
+-+/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
+- /* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+- /* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-@@ -152,7 +152,7 @@ unsigned int rpi_shader[] = {
+- /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
+-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+- /* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- /* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-@@ -178,9 +178,9 @@ unsigned int rpi_shader[] = {
+- /* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
+--/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+- /* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+- /* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+- /* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-@@ -189,400 +189,400 @@ unsigned int rpi_shader[] = {
+- /* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+- /* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+- /* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+- /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter
+--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+--/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+-+/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :loop
+--/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+-+/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // :fast_path
+--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :fast_loop
+--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+--/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+--/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+-+/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+-+/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+-+/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+-+/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+-+/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+-+/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+-+/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+-+/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+-+/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+-+/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+-+/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+-+/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+-+/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :bloop
+--/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+--/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+-+/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+-+/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_honly
+--/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+--/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+--/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+-+/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+-+/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+-+/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+-+/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+-+/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+-+/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+-+/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+-+/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+-+/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :loop_honly
+--/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+--/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+--/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+--/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+--/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+--/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+--/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+-+/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+-+/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+-+/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+-+/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+-+/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+-+/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+-+/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+- /* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_exit1
+--/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit
+--/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-@@ -592,225 +592,227 @@ unsigned int rpi_shader[] = {
+- /* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit4
+--/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup_uv
+--/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+--/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+--/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
+--/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
+-+/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv_b
+--/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index db971f4..3464cdb 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,16 +5,16 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 146)
+--#define mc_filter (rpi_shader + 360)
+--#define mc_filter_b (rpi_shader + 670)
+--#define mc_filter_honly (rpi_shader + 894)
+--#define mc_exit (rpi_shader + 1048)
+--#define mc_exit1 (rpi_shader + 1066)
+--#define mc_interrupt_exit (rpi_shader + 1082)
+--#define mc_interrupt_exit4 (rpi_shader + 1120)
+--#define mc_interrupt_exit8 (rpi_shader + 1142)
+--#define mc_setup_uv (rpi_shader + 1172)
+--#define mc_filter_uv_b (rpi_shader + 1314)
+--#define mc_end (rpi_shader + 1542)
+-+#define mc_filter (rpi_shader + 364)
+-+#define mc_filter_b (rpi_shader + 674)
+-+#define mc_filter_honly (rpi_shader + 898)
+-+#define mc_exit (rpi_shader + 1052)
+-+#define mc_exit1 (rpi_shader + 1070)
+-+#define mc_interrupt_exit (rpi_shader + 1086)
+-+#define mc_interrupt_exit4 (rpi_shader + 1124)
+-+#define mc_interrupt_exit8 (rpi_shader + 1146)
+-+#define mc_setup_uv (rpi_shader + 1176)
+-+#define mc_filter_uv_b (rpi_shader + 1318)
+-+#define mc_end (rpi_shader + 1546)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 02fdcb2..4809e1d 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -21,7 +21,7 @@
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+--# ra21                                          64
+-+# ra21                                          32
+- # ra22                                          256
+- # ra23                                          8
+- #
+-@@ -97,7 +97,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 64
+-+mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+- 
+-@@ -270,7 +270,7 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r2, r0, ra0
+-+nop                  ; mul24 r2, r0, ra0
+- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-@@ -301,9 +301,9 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 14          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -315,12 +315,14 @@ add r1, r1, r0          ; mul24 r0, ra10, rb10
+- add r1, r1, r0          ; mul24 r0, ra9, rb9
+- add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+--add.ifnn r1, r1, r0     ; mov -, vw_wait
+-+add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:uvloop
+- asr r1, r1, 14
+--min r1, r1, rb22
+--max vpm, r1, 0
+-+add r1, r1, ra21
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 6          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+- 
+- # DMA out for U
+- 
+-@@ -1161,7 +1163,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 64
+-+mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+- 
+--- 
+-2.7.4
+-
+-
+-From b7321192751956ed7deceeb3dabe22ccedb8e08d Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 14:37:32 +0100
+-Subject: [PATCH 19/68] Removed unused luma functions
+-
+----
+- libavcodec/hevc.c          |    4 +-
+- libavcodec/rpi_qpu.c       |   32 +-
+- libavcodec/rpi_shader.c    | 1097 +++++++++++++-------------------------------
+- libavcodec/rpi_shader.h    |   19 +-
+- libavcodec/rpi_shader.qasm |  970 +++------------------------------------
+- 5 files changed, 396 insertions(+), 1726 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index d6d78ee..31b8b2f 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2731,8 +2731,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         return;
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-         assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4d9eda8..4e90cc1 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -172,7 +172,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+- 
+-   // Now copy over the QPU code into GPU memory
+-   {
+--    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+-     assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-   }
+-@@ -612,24 +612,24 @@ unsigned int qpu_get_fn(int num) {
+-       gpu_unlock();
+-     }
+-     switch(num) {
+--    case QPU_MC_SETUP:
+--      fn = mc_setup;
+--      break;
+--    case QPU_MC_FILTER:
+--      fn = mc_filter;
+--      break;
+-+    //case QPU_MC_SETUP:
+-+    //  fn = mc_setup;
+-+    //  break;
+-+    //case QPU_MC_FILTER:
+-+    //  fn = mc_filter;
+-+    //  break;
+-     case QPU_MC_EXIT:
+-       fn = mc_exit;
+-       break;
+--    case QPU_MC_INTERRUPT_EXIT:
+--      fn = mc_interrupt_exit;
+--      break;
+--    case QPU_MC_FILTER_B:
+--      fn = mc_filter_b;
+--      break;
+--    case QPU_MC_FILTER_HONLY:
+--      fn = mc_filter_honly;
+--      break;
+-+    //case QPU_MC_INTERRUPT_EXIT:
+-+    //  fn = mc_interrupt_exit;
+-+    //  break;
+-+    //case QPU_MC_FILTER_B:
+-+    //  fn = mc_filter_b;
+-+    //  break;
+-+    //case QPU_MC_FILTER_HONLY:
+-+    //  fn = mc_filter_honly;
+-+    //  break;
+-     case QPU_MC_SETUP_UV:
+-       fn = mc_setup_uv;
+-       break;
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 831633b..170e8ac 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -21,798 +21,331 @@ __declspec(align(8))
+- __attribute__((aligned(8)))
+- #endif
+- unsigned int rpi_shader[] = {
+--// ::mc_setup
+-+// ::mc_setup_uv
+- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+- /* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+- /* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
+-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- /* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+- /* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
+--/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
+--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter
+--/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
+--/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :loop
+--/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
+--/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// :fast_path
+--/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :fast_loop
+--/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
+--/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
+--/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
+--/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_b
+--/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :bloop
+--/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
+--/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
+--/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_honly
+--/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
+--/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
+--/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
+--/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
+--/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
+--/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
+--/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
+--/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
+--/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
+--/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
+--/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :loop_honly
+--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
+--/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+--/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
+--/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
+--/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
+--/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
+--/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
+--/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
+--/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_exit
+--/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_exit1
+--/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit
+--/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit4
+--/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit8
+--/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_setup_uv
+--/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+--/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+--/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+--/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+--/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+--/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+-+/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+-+/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+-+/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+-+/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+-+/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+-+/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+-+/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3464cdb..9de4535 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -3,18 +3,11 @@
+- 
+- extern unsigned int rpi_shader[];
+- 
+--#define mc_setup (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 146)
+--#define mc_filter (rpi_shader + 364)
+--#define mc_filter_b (rpi_shader + 674)
+--#define mc_filter_honly (rpi_shader + 898)
+--#define mc_exit (rpi_shader + 1052)
+--#define mc_exit1 (rpi_shader + 1070)
+--#define mc_interrupt_exit (rpi_shader + 1086)
+--#define mc_interrupt_exit4 (rpi_shader + 1124)
+--#define mc_interrupt_exit8 (rpi_shader + 1146)
+--#define mc_setup_uv (rpi_shader + 1176)
+--#define mc_filter_uv_b (rpi_shader + 1318)
+--#define mc_end (rpi_shader + 1546)
+-+#define mc_setup_uv (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 142)
+-+#define mc_filter_uv_b (rpi_shader + 360)
+-+#define mc_exit (rpi_shader + 588)
+-+#define mc_interrupt_exit8 (rpi_shader + 606)
+-+#define mc_end (rpi_shader + 636)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 4809e1d..cd7346d 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -71,8 +71,10 @@
+- 
+- .set rb_const_64,                  rb21
+- 
+--# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
+--::mc_setup
+-+
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+::mc_setup_uv
+- 
+- # Read starting kernel
+- mov ra31, unif
+-@@ -80,7 +82,9 @@ mov ra31, unif
+- # Load first request location
+- add ra_x_base, unif, elem_num # Store x
+- mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame base
+-+mov ra_x2_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+- 
+- # Read image dimensions
+- sub rb25,unif,1
+-@@ -143,29 +147,24 @@ mov r1, vpm_setup(0, 4, h8p(0, 0))
+- add rb28, r0, r1
+- 
+- # Compute base address for first and second access
+--#add r0, unif, elem_num     # x
+- mov r0, ra_x_base           # Load x
+--add r2, r0, 8               # x+8
+- max r0, r0, 0; mov r1, ra_y # Load y
+- min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- add ra_y, r1, 1
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--max r1, r1, 0  # y
+--min r1, r1, rb_frame_height_minus_1
+--add r0, r0, r3; mul24 r1, r1, rb_pitch
+--add r2, r2, r3
+-+add r0, r0, r3
+- and r0, r0, ~3
+--and r2, r2, ~3; mov ra_x_base, r0
+-+max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_x2_base, r2
+- add t0s, r2, r1
+- 
+- # Dump padding words
+- mov r0, unif
+- mov r0, unif
+-+mov r0, unif
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -176,6 +175,8 @@ nop ; mul24 r1, r1, rb_pitch
+- add t0s, r1, ra_x_base
+- add t0s, r1, ra_x2_base
+- 
+-+
+-+
+- ################################################################################
+- 
+- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-@@ -341,453 +342,26 @@ add vw_setup, rb26, r0 # VDW setup 0
+- mov vw_setup, rb29 # Stride
+- mov vw_addr, unif # start the VDW
+- 
+--################################################################################
+--
+--
+--# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--add r0, r0, r3
+--add r2, r2, r3
+--and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+--mov ra_y_next, r1
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--brr.anynn -, r:fast_path
+--asr rb12, r0, rb23  # delay slot 1
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--
+--mov r5rep, -8 # delay slot 2
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r2, r2, r3                                                    ; ldtmu0
+--##
+--## mov r0, ra22
+--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # apply horizontal filter
+--##
+--## asr r2, r2, 15    ; mul24 r3, r0, ra0
+--## min r2, r2, rb22
+--## max ra13, r2, 0
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21
+--## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
+--## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra14, r0, 0
+--##
+--##
+--##
+--##
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra15, r0, 0
+--
+--
+--
+--
+--mov r3, 0
+--
+--:loop
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8 ; mov r1, ra22
+--
+--# apply horizontal filter
+--brr.anyn -, r:loop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:loop
+--asr r1, r1, 15
+--min r1, r1, rb22
+--max vpm, r1, 0
+--
+--# DMA out
+--
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+--
+--####################################################
+--
+--:fast_path
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21         ; mul24 r3, r0, ra0
+--## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--## sub r2, r2, r3                                                    ; ldtmu0
+--##
+--## mov r0, ra22
+--## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # apply horizontal filter
+--##
+--## asr r2, r2, 15    ; mul24 r3, r0, ra0
+--## min r2, r2, rb22
+--## max ra13, r2, 0
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21
+--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra14, r0, 0
+--##
+--##
+--##
+--##
+--## nop                                                                 ; ldtmu0     # loop counter increment
+--## shr r0, r4, ra17                                                    ; ldtmu0
+--## shr r1, r4, ra17     ; v8subs r0, r0, rb20
+--## add t0s, ra16, r5    ; v8subs r1, r1, rb20
+--## add ra16, ra16, rb16 ; mov t0s, ra16
+--##
+--## # generate seven shifted versions
+--## # interleave with scroll of vertical context
+--##
+--## mov r2, rb21   ; mul24    r3, r0, ra0
+--## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
+--## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
+--## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
+--## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
+--## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
+--## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
+--## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
+--## sub r0, r2, r3
+--##
+--## # apply horizontal filter
+--##
+--## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
+--## asr r0, r0, 15
+--## min r0, r0, rb22
+--## max ra15, r0, 0
+--
+--
+--mov r3, 0  # This signifies the amount of unrolling
+--
+--:fast_loop
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_y, ra_y_next   ; mov rb31, r3
+--mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
+--
+--max r2, ra_y, 0
+--min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
+--sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
+--sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
+--sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
+--sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
+--sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
+--sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
+--sub r0, r2, r3       ; mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8       ; mov r1, ra22
+--
+--# apply horizontal filter
+--
+--brr.anyn -, r:fast_loop
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--brr.anyn -, r:fast_loop
+--asr r1, r1, 15
+--min r1, r1, rb22
+--max vpm, r1, 0
+--
+--# DMA out
+--
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+--# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter_b
+-+::mc_filter_uv_b
+- mov ra31, unif
+- 
+- # per-channel shifts were calculated on the *previous* invocation
+- 
+- mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+- 
+- # get base addresses and per-channel shifts for *next* invocation
+- add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+- max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--add r2, r2, r3
+- and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+- mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -801,17 +375,22 @@ and r0, r0, rb22 # Extract height
+- add rb17, r0, 5
+- add rb18, r0, 7
+- shl r0, r0, 7
+-+
+- # r0 is currently height<<7
+- # For vr_setup we want height<<20 (so 20-7=13 additional bits)
+- shl r3, r0, 13
+- shl r3, r3, 8 # Mask off top 8 bits
+- shr r3, r3, 8
+-+
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+-+
+- # In a B frame, so also set up VPM read
+- add vr_setup, r3, rb28
+- 
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -837,9 +416,13 @@ asr rb12, r0, rb23
+- 
+- mov r5rep, -8
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+- mov r3, 0
+- 
+--:bloop
+-+:uvloop_b
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+- 
+-@@ -847,7 +430,7 @@ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+- shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+- mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+-@@ -861,6 +444,7 @@ add t0s, ra_x2_base, r2
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- mov r2, rb21         ; mul24 r3, r0, ra0
+-+nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+- sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+- sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-@@ -889,7 +473,7 @@ mov ra13, ra14
+- sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+--brr.anyn -, r:bloop
+-+brr.anyn -, r:uvloop_b
+- max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+- asr r0, r0, 15          ; mov r1, ra21
+- min.setf ra15, r0, rb22
+-@@ -906,213 +490,50 @@ sub r1, r1, r0          ; mul24 r0, ra8, rb8
+- sub r1, r1, r0          ; mul24 r0, ra15, rb15
+- sub.ifnn r1, r1, r0     ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15          ; mov -, vr_wait
+-+asr r1, r1, 15
+- min r1, r1, rb22
+- add r0, vpm, 1          # Blend in previous VPM contents at this location
+--brr.anyn -, r:bloop
+-+brr.anyn -, r:uvloop_b
+- max r1, r1, 0
+- add r1, r1, r0
+- shr vpm, r1, 1
+- 
+--# DMA out
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+- 
+- bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+- mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+--# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
+--# This filter only does horizontal filtering.
+--# It is assumed that the region to fetch does not include extra rows above.
+-+# mc_exit()
+- 
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+--::mc_filter_honly
+--mov ra31, unif
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+- 
+--# per-channel shifts were calculated on the *previous* invocation
+-+mov -,srel(0)
+- 
+--mov ra_xshift, ra_xshift_next
+--mov ra_x2shift, ra_x2shift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--add r2, r0, 8 # x+8
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
+--shl ra_xshift_next, r0, 3
+--max r2, r2, 0
+--min r2, r2, rb_frame_width_minus_1
+--shl ra_x2shift_next, r2, 3
+--add r0, r0, r3
+--add r2, r2, r3
+--and rb_x_base_next, r0, ~3
+--and ra_x2_base_next, r2, ~3
+--mov ra_y_next, r1
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
+--shl r0, r0, 7 ; mov rb18,r0
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--mov r0, unif
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
+--mov r3, 0
+--
+--:loop_honly
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3       ; mov r3, rb31
+--
+--sub.setf -, r3, rb18 ; mov r1, ra22
+--
+--mov -, vw_wait   ; mul24 r0, r0, r1
+--brr.anyn -, r:loop_honly
+--asr r0, r0, 15          # delay 1
+--min r0, r0, rb22        # delay 2
+--max vpm, r0, 0          # delay 3
+--
+--# DMA out
+--bra -, ra31
+--mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
+--mov vw_setup, rb29
+--mov vw_addr, unif # start the VDW
+--
+--
+--################################################################################
+--
+--# mc_exit()
+--
+--::mc_exit
+--mov  -, vw_wait # wait on the VDW
+--
+--mov -,srel(0)
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+- 
+- nop        ; nop ; thrend
+- nop        ; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+--::mc_exit1
+--mov  -, vw_wait # wait on the VDW
+--
+--#mov -,srel(1)
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+--# mc_interrupt_exit()
+--::mc_interrupt_exit
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--mov -,sacq(0) # 4
+--mov -,sacq(0) # 5
+--mov -,sacq(0) # 6
+--mov -,sacq(0) # 7
+--mov -,sacq(0) # 8
+--mov -,sacq(0) # 9
+--mov -,sacq(0) # 10
+--mov -,sacq(0) # 11
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+--# mc_interrupt_exit4()
+--::mc_interrupt_exit4
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+--
+- # mc_interrupt_exit8()
+- ::mc_interrupt_exit8
+- mov  -, vw_wait # wait on the VDW
+-@@ -1134,282 +555,5 @@ nop        ; nop ; thrend
+- mov interrupt, 1; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+--################################################################################
+--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+--::mc_setup_uv
+--
+--# Read starting kernel
+--mov ra31, unif
+--
+--# Load first request location
+--add ra_x_base, unif, elem_num # Store x
+--mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame u base
+--nop
+--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+--
+--# Read image dimensions
+--sub rb25,unif,1
+--sub rb30,unif,1
+--
+--# get source pitch
+--mov rb16, unif
+--
+--# get destination pitch
+--mov r0, unif
+--mov r1, vdw_setup_1(0)
+--add rb24, r1, r0
+--
+--# load constants
+--
+--mov ra20, 1
+--mov ra21, 32
+--mov ra22, 256
+--mov ra23, 8
+--
+--mov rb20, 0xffffff00
+--mov rb21, 64
+--mov rb22, 255
+--mov rb23, 24
+--
+--# touch vertical context to keep simulator happy
+--
+--mov ra8, 0
+--mov ra9, 0
+--mov ra10, 0
+--mov ra11, 0
+--mov ra12, 0
+--mov ra13, 0
+--mov ra14, 0
+--mov ra15, 0
+--
+--# Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+--
+--# Compute part of VPM to save data into
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))
+--add rb28, r0, r1
+--
+--# Compute base address for first and second access
+--mov r0, ra_x_base           # Load x
+--max r0, r0, 0; mov r1, ra_y # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+--shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--add ra_y, r1, 1
+--add r0, r0, r3
+--and r0, r0, ~3
+--max r1, r1, 0 ; mov ra_x_base, r0 # y
+--min r1, r1, rb_frame_height_minus_1
+--# submit texture requests for first line
+--add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--add t0s, r0, r1 ; mov ra_x2_base, r2
+--add t0s, r2, r1
+--
+--# Dump padding words
+--mov r0, unif
+--mov r0, unif
+--mov r0, unif
+--
+--# submit texture requests for second line
+--max r1, ra_y, 0
+--min r1, r1, rb_frame_height_minus_1
+--add ra_y, ra_y, 1
+--bra -, ra31
+--nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_x_base
+--add t0s, r1, ra_x2_base
+--
+--
+--
+--################################################################################
+--
+--::mc_filter_uv_b
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num    # x
+--max r0, r0, 0; mov r1, unif # y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+--shl ra_xshift_next, r0, 3
+--sub r2, unif, r3 # compute offset from frame base u to frame base v
+--add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+--mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--
+--# r0 is currently height<<7
+--# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+--shl r3, r0, 13
+--shl r3, r3, 8 # Mask off top 8 bits
+--shr r3, r3, 8
+--
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# In a B frame, so also set up VPM read
+--add vr_setup, r3, rb28
+--
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+--# get filter coefficients
+--
+--mov r0, unif
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+--
+--# r2 is elem_num
+--# r3 is loop counter
+--
+--mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--mov r3, 0
+--
+--:uvloop_b
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+--
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+--
+--# generate seven shifted versions
+--# interleave with scroll of vertical context
+--
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 8 ; mov r1, ra22
+--
+--# apply horizontal filter
+--brr.anyn -, r:uvloop_b
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15
+--min r1, r1, rb22
+--add r0, vpm, 1          # Blend in previous VPM contents at this location
+--brr.anyn -, r:uvloop_b
+--max r1, r1, 0
+--add r1, r1, r0
+--shr vpm, r1, 1
+--
+--
+--# DMA out for U
+--
+--mov vw_setup, rb26 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+--
+--# DMA out for V
+--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+--# Could potentially push this write into the start of the next pipeline stage.
+--mov r0, 16
+--mov -, vw_wait
+--
+--bra -, ra31
+--add vw_setup, rb26, r0 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+--
+- ::mc_end
+-+# Do not add code here because mc_end must appear after all other code.
+--- 
+-2.7.4
+-
+-
+-From d40d59de0f09fd1a6e7146532418b63d8e2711b7 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 14:54:25 +0100
+-Subject: [PATCH 20/68] Moved chroma P1 to QPUs
+-
+----
+- libavcodec/hevc.c | 38 ++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 38 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 31b8b2f..391d139 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2070,6 +2070,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int reflist = 1;
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[reflist];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+                int chan = x0>>8;
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+--- 
+-2.7.4
+-
+-
+-From 75777ba7927086e862104b14f6446e81bc789611 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 13 May 2015 15:13:47 +0100
+-Subject: [PATCH 21/68] Added B prediction - not quite right
+-
+----
+- libavcodec/hevc.c          |  58 ++++++++++++++++++++++++
+- libavcodec/rpi_shader.c    | 108 +++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   6 +--
+- libavcodec/rpi_shader.qasm |  48 ++++++++++----------
+- 4 files changed, 141 insertions(+), 79 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 391d139..47ddfff 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2127,6 +2127,64 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+-+#ifdef RPI_INTER_QPU
+-+            if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                const Mv *mv2         = &current_mv.mv[1];
+-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+-+                intptr_t _mx2         = mx2 << (1 - hshift);
+-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
+-+
+-+                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-+
+-+                uint32_t *u = s->u_mvs[chan & 7];
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = rpi_filter_coefs[_my][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-+                      *u++ = rpi_filter_coefs[_mx2][0];
+-+                      *u++ = rpi_filter_coefs[_mx2][1];
+-+                      *u++ = rpi_filter_coefs[_my2][0];
+-+                      *u++ = rpi_filter_coefs[_my2][1];
+-+                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->u_mvs[chan & 7] = u;
+-+                return;
+-+            }
+-+#endif
+-             RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 170e8ac..5d00cb2 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -265,23 +265,23 @@ unsigned int rpi_shader[] = {
+- /* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+- /* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+- /* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
+--/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- /* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- /* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- /* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- /* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- /* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- /* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
+-+/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+- /* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+- /* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+- /* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-@@ -291,61 +291,63 @@ unsigned int rpi_shader[] = {
+- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+- /* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
+--/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
+--/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
+--/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
+--/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+--/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+- /* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 9de4535..e36c4ae 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,8 +6,8 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 142)
+- #define mc_filter_uv_b (rpi_shader + 360)
+--#define mc_exit (rpi_shader + 588)
+--#define mc_interrupt_exit8 (rpi_shader + 606)
+--#define mc_end (rpi_shader + 636)
+-+#define mc_exit (rpi_shader + 592)
+-+#define mc_interrupt_exit8 (rpi_shader + 610)
+-+#define mc_end (rpi_shader + 640)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index cd7346d..870437d2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -443,23 +443,23 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+--mov r2, rb21         ; mul24 r3, r0, ra0
+--nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
+--sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+- nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+- nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+- nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+- nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+- nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--sub r0, r2, r3
+-+add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+-@@ -474,23 +474,25 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+--max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr r0, r0, 15          ; mov r1, ra21
+--min.setf ra15, r0, rb22
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r0, ra14, rb14
+--sub r1, r1, r0          ; mul24 r0, ra13, rb13
+--sub r1, r1, r0          ; mul24 r0, ra12, rb12
+--sub r1, r1, r0          ; mul24 r0, ra11, rb11
+--sub r1, r1, r0          ; mul24 r0, ra10, rb10
+--sub r1, r1, r0          ; mul24 r0, ra9, rb9
+--sub r1, r1, r0          ; mul24 r0, ra8, rb8
+--sub r1, r1, r0          ; mul24 r0, ra15, rb15
+--sub.ifnn r1, r1, r0     ; mov -, vw_wait
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 15
+-+asr r1, r1, 14
+-+add r1, r1, ra21
+-+asr r1, r1, 6
+- min r1, r1, rb22
+- add r0, vpm, 1          # Blend in previous VPM contents at this location
+- brr.anyn -, r:uvloop_b
+--- 
+-2.7.4
+-
+-
+-From 3d4e94b8f0b08fe4c0b582fc7f1dbe9d1d9d60ed Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 08:15:55 +0100
+-Subject: [PATCH 22/68] Added flush for SAO
+-
+----
+- libavcodec/hevc.c        |  2 +-
+- libavcodec/hevc_filter.c | 39 ++++++++++++++++++++++++++-------------
+- 2 files changed, 27 insertions(+), 14 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 47ddfff..93e1eba 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2903,7 +2903,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             rpi_execute_inter_qpu(s);
+- #endif
+-             // Transform all blocks
+--            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 9b6e26d..92a8271 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -871,6 +871,21 @@ static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+- }
+-+
+-+static void ff_hevc_flush_chroma(HEVCContext *s)
+-+{
+-+    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-+            s->nal_unit_type == NAL_TSA_N   ||
+-+            s->nal_unit_type == NAL_STSA_N  ||
+-+            s->nal_unit_type == NAL_RADL_N  ||
+-+            s->nal_unit_type == NAL_RASL_N )) {
+-+        flush_buffer(s->frame->buf[1]);
+-+        flush_buffer(s->frame->buf[2]);
+-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-+    }
+-+}
+- #endif
+- 
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-@@ -886,31 +901,29 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x - ctb_size, y);
+-         if (y && x_end) {
+-             sao_filter_CTB(s, x, y - ctb_size);
+--            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_chroma(s);
+-+#endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-+            }
+-         }
+-         if (x_end && y_end) {
+-             sao_filter_CTB(s, x , y);
+--            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_chroma(s);
+-+#endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-+            }
+-         }
+-     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+-         //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+--        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )) {
+- #ifdef RPI_INTER_QPU
+--            flush_buffer(s->frame->buf[1]);
+--            flush_buffer(s->frame->buf[2]);
+-+        ff_hevc_flush_chroma(s);
+- #endif
+--            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+--            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+--            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+--        }
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+- }
+--- 
+-2.7.4
+-
+-
+-From 3e337b9c4ef0c356a0259be2254ad1bc4d5bbe29 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 09:17:28 +0100
+-Subject: [PATCH 23/68] Stopped using acceleration in unsupported cases
+-
+----
+- libavcodec/hevc.c       | 14 +++++++-------
+- libavcodec/hevc_cabac.c |  4 ++--
+- 2 files changed, 9 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 93e1eba..bfd5a55 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -1152,15 +1152,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+--                        printf("Cross component not supported\n"); // TODO
+--                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+- 
+-             if (lc->tu.cross_pf) {
+--                printf("Cross component not supported\n"); // TODO
+--                exit(-1);
+-                 hls_cross_component_pred(s, 1);
+-             }
+-             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+-@@ -1189,8 +1185,6 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+-                         for (i = 0; i < (size * size); i++) {
+-                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+-                         }
+--                        printf("Cross component not supported\n"); // TODO
+--                        exit(-1);
+-                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
+-                     }
+-             }
+-@@ -2857,7 +2851,13 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+--    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
+-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+-+                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-+                    && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+-+
+- #endif
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 4f072be..38f53de 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1513,9 +1513,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+- #ifdef RPI
+-             if (!use_vpu) {
+-               int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+--              if (max_xy == 0)
+-+              if (max_xy == 0) {
+-                   s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+--              else {
+-+              } else {
+-                   int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
+-                   if (max_xy < 4)
+-                       col_limit = FFMIN(4, col_limit);
+--- 
+-2.7.4
+-
+-
+-From 3941d3e4c2305fa037e8aba5a14cf698ac8673db Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 09:42:16 +0100
+-Subject: [PATCH 24/68] Split B prediction into two passes
+-
+----
+- libavcodec/hevc.c          |   1 +
+- libavcodec/hevc.h          |   1 +
+- libavcodec/rpi_qpu.c       |   3 +
+- libavcodec/rpi_qpu.h       |   1 +
+- libavcodec/rpi_shader.c    | 559 +++++++++++++++++++++++++++------------------
+- libavcodec/rpi_shader.h    |  11 +-
+- libavcodec/rpi_shader.qasm | 196 ++++++++++++++--
+- 7 files changed, 531 insertions(+), 241 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index bfd5a55..4b133d2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3801,6 +3801,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-             p += uv_commands_per_qpu;
+-         }
+-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-         s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+- 
+-     }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index d513579..4a39e39 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -917,6 +917,7 @@ typedef struct HEVCContext {
+-     uint32_t *u_mvs[8];
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4e90cc1..60bf079 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -636,6 +636,9 @@ unsigned int qpu_get_fn(int num) {
+-     case QPU_MC_FILTER_UV:
+-       fn = mc_filter_uv;
+-       break;
+-+    case QPU_MC_FILTER_UV_B0:
+-+      fn = mc_filter_uv_b0;
+-+      break;
+-     case QPU_MC_FILTER_UV_B:
+-       fn = mc_filter_uv_b;
+-       break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index f9ad333..543c84b 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -29,6 +29,7 @@ enum {
+-   QPU_MC_FILTER_HONLY,
+-   QPU_MC_SETUP_UV,
+-   QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B0,
+-   QPU_MC_FILTER_UV_B,
+-   QPU_MC_INTERRUPT_EXIT8,
+-   QPU_MC_END
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 5d00cb2..88ad20b 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -39,18 +39,18 @@ unsigned int rpi_shader[] = {
+- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+- /* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
+--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+- /* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+- /* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+- /* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-@@ -62,176 +62,176 @@ unsigned int rpi_shader[] = {
+- /* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+- /* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+- /* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--// ::mc_filter_uv_b
+--/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_uv_b0
+-+/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+- /* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+- /* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-@@ -253,7 +253,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+- /* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- /* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+--// :uvloop_b
+-+// :uvloop_b0
+- /* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+- /* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+- /* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-@@ -290,7 +290,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+- /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+- /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+- /* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+- /* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+- /* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-@@ -306,48 +306,163 @@ unsigned int rpi_shader[] = {
+- /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- /* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+- /* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_uv_b
+-+/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :uvloop_b
+-+/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+-+/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+-+/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index e36c4ae..809e582 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,10 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 142)
+--#define mc_filter_uv_b (rpi_shader + 360)
+--#define mc_exit (rpi_shader + 592)
+--#define mc_interrupt_exit8 (rpi_shader + 610)
+--#define mc_end (rpi_shader + 640)
+-+#define mc_filter_uv (rpi_shader + 150)
+-+#define mc_filter_uv_b0 (rpi_shader + 368)
+-+#define mc_filter_uv_b (rpi_shader + 586)
+-+#define mc_exit (rpi_shader + 818)
+-+#define mc_interrupt_exit8 (rpi_shader + 836)
+-+#define mc_end (rpi_shader + 866)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 870437d2..635b894 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -26,7 +26,7 @@
+- # ra23                                          8
+- #
+- # rb20                                          0xffffff00
+--# rb21                                          64
+-+# rb21                                          vpm_setup for writing 16bit results into VPM
+- # rb22                                          255
+- # rb23                                          24
+- #
+-@@ -34,7 +34,7 @@
+- # rb25                                          frame width-1
+- # rb26                                          height<<23 + width<<16 + vdw_setup_0
+- # rb27                                          vdw_setup_0 (depends on QPU number)
+--# rb28                                          vpm_setup (depends on QPU number)
+-+# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
+- # rb29                                          vdw_setup_1(dst_pitch-width)
+- # rb30                                          frame height-1
+- # rb31                                          used as temp to count loop iterations
+-@@ -69,8 +69,6 @@
+- .set ra_y_next,                    ra28
+- .set ra_y,                         ra29
+- 
+--.set rb_const_64,                  rb21
+--
+- 
+- ################################################################################
+- # mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-@@ -106,7 +104,6 @@ mov ra22, 256
+- mov ra23, 8
+- 
+- mov rb20, 0xffffff00
+--mov rb21, 64
+- mov rb22, 255
+- mov rb23, 24
+- 
+-@@ -123,6 +120,7 @@ mov ra15, 0
+- 
+- # Compute part of VPM to use for DMA output
+- mov r2, qpu_num
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+- and r2, r2, 15
+- mov r1, r2
+- asr r1, r1, 2
+-@@ -135,16 +133,21 @@ shl r0, r0, 5
+- add rb27, r0, r1
+- 
+- # Compute part of VPM to save data into
+--mov r2, qpu_num
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+mov r2, qpu_num   # qpu_num = abcd
+-+shl r2, r2, 1
+-+and r2, r2, 15    # r2 = bcd0
+-+mov r1, r2        # r1 = bcd0
+-+asr r1, r1, 2     # r1 = bc
+-+shl r1, r1, 6     # r1 = bc000000
+-+mov r0, r2        # r0 = bcd0
+-+and r0, r0, 3     # r0 = d0
+-+add r0, r0, r1    # r0 = bc0000d0
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+- add rb28, r0, r1
+-+asr r0, r0, 1     # r0 = bc0000d
+-+# Prepare VPM command for 16bit intermediates
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r0, r1
+- 
+- # Compute base address for first and second access
+- mov r0, ra_x_base           # Load x
+-@@ -345,6 +348,171 @@ mov vw_addr, unif # start the VDW
+- 
+- ################################################################################
+- 
+-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-+
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+::mc_filter_uv_b0
+-+mov ra31, unif
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+shl ra_xshift_next, r0, 3
+-+sub r2, unif, r3 # compute offset from frame base u to frame base v
+-+add r0, r0, r3
+-+and rb_x_base_next, r0, ~3
+-+mov ra_y_next, r1
+-+add ra_x2_base_next, rb_x_base_next, r2
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5
+-+add rb18, r0, 7
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+-+
+-+# get filter coefficients
+-+
+-+mov r0, unif
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb12, r0, rb23
+-+
+-+# r2 is elem_num
+-+# r3 is loop counter
+-+
+-+mov r5rep, -8
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:uvloop_b0
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_x2_base, r2
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3
+-+
+-+mov r3, rb31
+-+
+-+mov ra8, ra9
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+mov ra13, ra14
+-+
+-+sub.setf -, r3, 8 ; mov r1, ra22
+-+
+-+# apply horizontal filter
+-+brr.anyn -, r:uvloop_b0
+-+mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+-+asr ra15, r0, 8         ; nop
+-+nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb14
+-+nop                     ; mul24 r0, ra13, rb13
+-+add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+add r1, r1, ra21
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, 6          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+-+
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+################################################################################
+-+
+- ::mc_filter_uv_b
+- mov ra31, unif
+- 
+--- 
+-2.7.4
+-
+-
+-From 85d0ffa2bcf6a2b94c1a0c8f84241cda9ac92ce2 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:04:55 +0100
+-Subject: [PATCH 25/68] Switch to using 16bit temp buffers
+-
+----
+- libavcodec/hevc.c          |  2 +-
+- libavcodec/rpi_shader.c    |  4 ++--
+- libavcodec/rpi_shader.qasm | 10 +++++-----
+- 3 files changed, 8 insertions(+), 8 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4b133d2..28a6660 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2147,7 +2147,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 88ad20b..ffd3a07 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -220,7 +220,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+- /* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+- /* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+- /* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+- /* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-@@ -346,7 +346,7 @@ unsigned int rpi_shader[] = {
+- /* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
+-+/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+- /* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+- /* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 635b894..9577121 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -26,7 +26,7 @@
+- # ra23                                          8
+- #
+- # rb20                                          0xffffff00
+--# rb21                                          vpm_setup for writing 16bit results into VPM
+-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+- # rb22                                          255
+- # rb23                                          24
+- #
+-@@ -370,8 +370,8 @@ and rb_x_base_next, r0, ~3
+- mov ra_y_next, r1
+- add ra_x2_base_next, rb_x_base_next, r2
+- 
+--# set up VPM write
+--mov vw_setup, rb28
+-+# set up VPM write, we need to save 16bit precision
+-+mov vw_setup, rb21
+- 
+- # get width,height of block
+- mov r2, 16
+-@@ -554,8 +554,8 @@ add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--# In a B frame, so also set up VPM read
+--add vr_setup, r3, rb28
+-+# In a B frame, so also set up VPM read (reading back 16bit precision)
+-+add vr_setup, r3, rb21
+- 
+- sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+- 
+--- 
+-2.7.4
+-
+-
+-From abc51bf61df597082fbd7cf1bba5031e4d44318b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:30:44 +0100
+-Subject: [PATCH 26/68] Corrected B prediction: matching md5 sum for hobbit50
+-
+----
+- libavcodec/rpi_shader.c    | 815 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  36 +-
+- 3 files changed, 429 insertions(+), 434 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index ffd3a07..77cca46 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -38,431 +38,428 @@ unsigned int rpi_shader[] = {
+- /* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+- /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+- /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+-+/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+-+/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+-+/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+-+/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
+--/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
+--/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 809e582..6562fa9 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 150)
+--#define mc_filter_uv_b0 (rpi_shader + 368)
+--#define mc_filter_uv_b (rpi_shader + 586)
+--#define mc_exit (rpi_shader + 818)
+--#define mc_interrupt_exit8 (rpi_shader + 836)
+--#define mc_end (rpi_shader + 866)
+-+#define mc_filter_uv (rpi_shader + 152)
+-+#define mc_filter_uv_b0 (rpi_shader + 370)
+-+#define mc_filter_uv_b (rpi_shader + 584)
+-+#define mc_exit (rpi_shader + 812)
+-+#define mc_interrupt_exit8 (rpi_shader + 830)
+-+#define mc_end (rpi_shader + 860)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 9577121..562dc35 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -39,13 +39,13 @@
+- # rb30                                          frame height-1
+- # rb31                                          used as temp to count loop iterations
+- #
+--# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
+- # ra24                                          clipped(row start address+8+elem_num)&~3
+- # ra25                                          per-channel shifts 2
+- # ra26                                          next ra24
+- # ra27                                          next ra25
+- # ra28                                          next y
+- # ra29                                          y for next texture access
+-+# ra30                                          64
+- #
+- # ra31                                          next kernel address
+- 
+-@@ -102,6 +102,7 @@ mov ra20, 1
+- mov ra21, 32
+- mov ra22, 256
+- mov ra23, 8
+-+mov ra30, 64
+- 
+- mov rb20, 0xffffff00
+- mov rb22, 255
+-@@ -472,7 +473,7 @@ sub.setf -, r3, 8 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b0
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop
+-+asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
+- nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+- 
+- # apply vertical filter and write to VPM
+-@@ -487,18 +488,18 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--add r1, r1, ra21
+--brr.anyn -, r:uvloop
+--asr r1, r1, 6          # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+#asr r1, r1, 14
+-+#add r1, r1, ra21
+-+brr.anyn -, r:uvloop_b0
+-+asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+-+nop                    # Delay 2
+-+nop                    # Delay 3
+- 
+- # DMA out for U
+- 
+- mov vw_setup, rb26 # VDW setup 0
+- mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+-+mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
+- 
+- # DMA out for V
+- # We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-@@ -639,12 +640,11 @@ mov ra12, ra13
+- mov ra13, ra14
+- 
+- sub.setf -, r3, 8 ; mov r1, ra22
+--
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+- asr ra15, r0, 8         ; nop
+--nop                     ; nop
+-+nop                     ; nop    # TODO improve use of delay slots
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -658,15 +658,13 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb15
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--add r1, r1, ra21
+--asr r1, r1, 6
+--min r1, r1, rb22
+--add r0, vpm, 1          # Blend in previous VPM contents at this location
+-+asr r1, r1, 14          # shift2=6
+-+add r1, r1, vpm         # Blend in previous VPM contents at this location
+-+add r1, r1, ra30
+- brr.anyn -, r:uvloop_b
+--max r1, r1, 0
+--add r1, r1, r0
+--shr vpm, r1, 1
+-+asr r1, r1, 7           # Delay 1
+-+min r1, r1, rb22        # Delay 2
+-+max vpm, r1, 0          # Delay 3
+- 
+- 
+- # DMA out for U
+--- 
+-2.7.4
+-
+-
+-From ea60373134f98099c4ebaf0d23cca666008b4bba Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 10:55:07 +0100
+-Subject: [PATCH 27/68] P prediction uses 4 tap filters
+-
+----
+- libavcodec/hevc.c          |  50 ++--
+- libavcodec/rpi_shader.c    | 631 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  43 +--
+- 4 files changed, 344 insertions(+), 390 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 28a6660..a47ebc5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -65,15 +65,15 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+- // TODO Chroma only needs 4 taps
+--static uint32_t rpi_filter_coefs[8][2] = {
+--        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
+--        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
+-+static uint32_t rpi_filter_coefs[8][1] = {
+-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
+-+        { ENCODE_COEFFS(  -2,  58,  10,  -2) },
+-+        { ENCODE_COEFFS(  -4,  54,  16,  -2) },
+-+        { ENCODE_COEFFS(  -6,  46,  28,  -4) },
+-+        { ENCODE_COEFFS(  -4,  36,  36,  -4) },
+-+        { ENCODE_COEFFS(  -4,  28,  46,  -6) },
+-+        { ENCODE_COEFFS(  -2,  16,  54,  -4) },
+-+        { ENCODE_COEFFS(  -2,  10,  58,  -2) }
+- };
+- 
+- static uint32_t get_vc_address(AVBufferRef *bref) {
+-@@ -2027,16 +2027,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2084,16 +2084,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2148,29 +2148,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      *u++ = rpi_filter_coefs[_mx][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      *u++ = rpi_filter_coefs[_my][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+--                      *u++ = rpi_filter_coefs[_mx2][1];
+-+                      u++;
+-                       *u++ = rpi_filter_coefs[_my2][0];
+--                      *u++ = rpi_filter_coefs[_my2][1];
+-+                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 77cca46..c8d0728 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -116,8 +116,8 @@ unsigned int rpi_shader[] = {
+- /* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-@@ -128,338 +128,315 @@ unsigned int rpi_shader[] = {
+- /* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- /* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- /* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 6562fa9..1bf7a68 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 370)
+--#define mc_filter_uv_b (rpi_shader + 584)
+--#define mc_exit (rpi_shader + 812)
+--#define mc_interrupt_exit8 (rpi_shader + 830)
+--#define mc_end (rpi_shader + 860)
+-+#define mc_filter_uv_b0 (rpi_shader + 324)
+-+#define mc_filter_uv_b (rpi_shader + 538)
+-+#define mc_exit (rpi_shader + 766)
+-+#define mc_interrupt_exit8 (rpi_shader + 784)
+-+#define mc_end (rpi_shader + 814)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 562dc35..8e4f18f 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -16,8 +16,8 @@
+- # ra19                                          next ra17
+- #
+- # rb16                                          pitch
+--# rb17                                          height + 5
+--# rb18                                          height + 7
+-+# rb17                                          height + 1
+-+# rb18                                          height + 3
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+-@@ -214,8 +214,8 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-@@ -230,18 +230,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -283,26 +276,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop
+-@@ -312,14 +293,10 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14
+--- 
+-2.7.4
+-
+-
+-From e4bdd110d4640519b751ab428e7976a1e9a15802 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:03:51 +0100
+-Subject: [PATCH 28/68] Optimised B0 pass
+-
+----
+- libavcodec/rpi_shader.c    | 424 +++++++++++++++++++++------------------------
+- libavcodec/rpi_shader.h    |   8 +-
+- libavcodec/rpi_shader.qasm |  43 +----
+- 3 files changed, 212 insertions(+), 263 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index c8d0728..1f63ee0 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -204,239 +204,215 @@ unsigned int rpi_shader[] = {
+- /* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+-+/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+-+/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+-+/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+-+/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+-+/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+-+/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+-+/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+-+/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+-+/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+-+/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+-+/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 1bf7a68..cb74887 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 324)
+--#define mc_filter_uv_b (rpi_shader + 538)
+--#define mc_exit (rpi_shader + 766)
+--#define mc_interrupt_exit8 (rpi_shader + 784)
+--#define mc_end (rpi_shader + 814)
+-+#define mc_filter_uv_b (rpi_shader + 490)
+-+#define mc_exit (rpi_shader + 718)
+-+#define mc_interrupt_exit8 (rpi_shader + 736)
+-+#define mc_end (rpi_shader + 766)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 8e4f18f..faa5755 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -357,15 +357,13 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -373,18 +371,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -426,26 +417,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- 
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b0
+-@@ -455,18 +434,12 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--#asr r1, r1, 14
+--#add r1, r1, ra21
+- brr.anyn -, r:uvloop_b0
+- asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+- nop                    # Delay 2
+--- 
+-2.7.4
+-
+-
+-From 93805e78a13d36e28ed84a0e8456da2eac45be89 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:12:43 +0100
+-Subject: [PATCH 29/68] Optimised B pass
+-
+----
+- libavcodec/rpi_shader.c    | 202 ++++++++++++++++++++-------------------------
+- libavcodec/rpi_shader.h    |   6 +-
+- libavcodec/rpi_shader.qasm |  41 ++-------
+- 3 files changed, 100 insertions(+), 149 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 1f63ee0..4e6c5ea 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -289,8 +289,8 @@ unsigned int rpi_shader[] = {
+- /* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+- /* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+- /* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+- /* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+- /* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+- /* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-@@ -299,120 +299,96 @@ unsigned int rpi_shader[] = {
+- /* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+- /* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
+--/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
+--/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
+--/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
+--/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
+--/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
+--/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
+--/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
+--/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
+--/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
+--/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
+--/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index cb74887..53da629 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -7,8 +7,8 @@ extern unsigned int rpi_shader[];
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 324)
+- #define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 718)
+--#define mc_interrupt_exit8 (rpi_shader + 736)
+--#define mc_end (rpi_shader + 766)
+-+#define mc_exit (rpi_shader + 670)
+-+#define mc_interrupt_exit8 (rpi_shader + 688)
+-+#define mc_end (rpi_shader + 718)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index faa5755..f38c926 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -491,8 +491,8 @@ mov r0, unif
+- shr r1, r0, r2 # Extract width
+- sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+- and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+-+add rb17, r0, 1
+-+add rb18, r0, 3
+- shl r0, r0, 7
+- 
+- # r0 is currently height<<7
+-@@ -508,8 +508,6 @@ add rb26, r0, rb27
+- # In a B frame, so also set up VPM read (reading back 16bit precision)
+- add vr_setup, r3, rb21
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -517,18 +515,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+-+                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23;      mov r0, unif
+--asr rb15, r0, rb23;     mul24 r0, r0, ra22
+--asr rb14, r0, rb23;     mul24 r0, r0, ra22
+--asr rb13, r0, rb23;     mul24 r0, r0, ra22
+--asr rb12, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -570,26 +561,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+- add r0, r2, r3
+- 
+- mov r3, rb31
+- 
+--mov ra8, ra9
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+- mov ra12, ra13
+- mov ra13, ra14
+- 
+--sub.setf -, r3, 8 ; mov r1, ra22
+-+sub.setf -, r3, 4 ; mov r1, ra22
+- # apply horizontal filter
+- brr.anyn -, r:uvloop_b
+- mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+-@@ -598,14 +577,10 @@ nop                     ; nop    # TODO improve use of delay slots
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb14
+--nop                     ; mul24 r0, ra13, rb13
+--add r1, r1, r0          ; mul24 r0, ra12, rb12
+--add r1, r1, r0          ; mul24 r0, ra11, rb11
+--add r1, r1, r0          ; mul24 r0, ra10, rb10
+--add r1, r1, r0          ; mul24 r0, ra9, rb9
+--add r1, r1, r0          ; mul24 r0, ra8, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb15
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14          # shift2=6
+--- 
+-2.7.4
+-
+-
+-From e48df43c16de74dddbc7c702d64dd01eaf8e6b39 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:17:09 +0100
+-Subject: [PATCH 30/68] Used P delay slots more efficiently
+-
+----
+- libavcodec/rpi_shader.c    | 437 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  19 +-
+- 3 files changed, 228 insertions(+), 238 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 4e6c5ea..a1af4e3 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -156,239 +156,236 @@ unsigned int rpi_shader[] = {
+- /* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- /* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- /* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+- /* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+-+/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+-+/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+-+/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+-+/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+-+/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+-+/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 53da629..1fb3e37 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 324)
+--#define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 670)
+--#define mc_interrupt_exit8 (rpi_shader + 688)
+--#define mc_end (rpi_shader + 718)
+-+#define mc_filter_uv_b0 (rpi_shader + 318)
+-+#define mc_filter_uv_b (rpi_shader + 484)
+-+#define mc_exit (rpi_shader + 664)
+-+#define mc_interrupt_exit8 (rpi_shader + 682)
+-+#define mc_end (rpi_shader + 712)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index f38c926..02e95dd 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -268,6 +268,7 @@ add t0s, ra_x2_base, r2
+- 
+- mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+-+# apply horizontal filter
+- nop                  ; mul24 r2, r0, ra0
+- nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+- nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-@@ -276,20 +277,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop
+--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--- 
+-2.7.4
+-
+-
+-From b33dfc243ff5509299685add3c532ab7f207fd73 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:22:25 +0100
+-Subject: [PATCH 31/68] Improved use of delay slots
+-
+----
+- libavcodec/rpi_shader.c    | 503 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  10 +-
+- libavcodec/rpi_shader.qasm |  41 ++--
+- 3 files changed, 265 insertions(+), 289 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index a1af4e3..c498f28 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -122,270 +122,263 @@ unsigned int rpi_shader[] = {
+- /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+- /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+- /* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
+--/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
+--/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b
+--/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
+--/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
+--/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
+--/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
+--/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
+--/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
+--/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--// ::mc_interrupt_exit8
+--/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 1fb3e37..3fac45f 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 318)
+--#define mc_filter_uv_b (rpi_shader + 484)
+--#define mc_exit (rpi_shader + 664)
+--#define mc_interrupt_exit8 (rpi_shader + 682)
+--#define mc_end (rpi_shader + 712)
+-+#define mc_filter_uv_b0 (rpi_shader + 316)
+-+#define mc_filter_uv_b (rpi_shader + 476)
+-+#define mc_exit (rpi_shader + 650)
+-+#define mc_interrupt_exit8 (rpi_shader + 668)
+-+#define mc_end (rpi_shader + 698)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 02e95dd..10f5113 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -221,8 +221,6 @@ add r0, r0, r1 # Combine width and height of destination area
+- shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+- add rb26, r0, rb27
+- 
+--sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
+--
+- # get filter coefficients
+- 
+- mov r0, unif
+-@@ -410,20 +408,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop_b0
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
+--asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
+--nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+-@@ -432,9 +422,9 @@ nop                     ; mul24 r0, ra13, rb9
+- add r1, r1, r0          ; mul24 r0, ra12, rb8
+- add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+sub.setf -, r3, rb18
+- brr.anyn -, r:uvloop_b0
+--asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
+-+asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still in 16bit precision
+- nop                    # Delay 2
+- nop                    # Delay 3
+- 
+-@@ -554,19 +544,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+- nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+- add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+- nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r0, r2, r3
+--
+--mov r3, rb31
+--
+--mov ra12, ra13
+--mov ra13, ra14
+--
+--sub.setf -, r3, 4 ; mov r1, ra22
+--# apply horizontal filter
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+- brr.anyn -, r:uvloop_b
+--mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
+--asr ra15, r0, 8         ; nop
+--nop                     ; nop    # TODO improve use of delay slots
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--- 
+-2.7.4
+-
+-
+-From af59f8e00eb977e97debc5e72ba47e0077db1787 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:31:23 +0100
+-Subject: [PATCH 32/68] Avoid writeback of first B results
+-
+----
+- libavcodec/rpi_shader.c    | 229 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |   8 +-
+- libavcodec/rpi_shader.qasm |  18 +---
+- 3 files changed, 121 insertions(+), 134 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index c498f28..ba453a2 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -255,130 +255,125 @@ unsigned int rpi_shader[] = {
+- /* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+- /* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+- /* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3fac45f..45dbe0e 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
+- #define mc_setup_uv (rpi_shader + 0)
+- #define mc_filter_uv (rpi_shader + 152)
+- #define mc_filter_uv_b0 (rpi_shader + 316)
+--#define mc_filter_uv_b (rpi_shader + 476)
+--#define mc_exit (rpi_shader + 650)
+--#define mc_interrupt_exit8 (rpi_shader + 668)
+--#define mc_end (rpi_shader + 698)
+-+#define mc_filter_uv_b (rpi_shader + 466)
+-+#define mc_exit (rpi_shader + 640)
+-+#define mc_interrupt_exit8 (rpi_shader + 658)
+-+#define mc_end (rpi_shader + 688)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 10f5113..e138c95 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -428,22 +428,14 @@ asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still
+- nop                    # Delay 2
+- nop                    # Delay 3
+- 
+-+# in pass0 we don't really need to save any results, but need to discard the uniforms
+- # DMA out for U
+- 
+--mov vw_setup, rb26 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
+--
+--# DMA out for V
+--# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+--# Could potentially push this write into the start of the next pipeline stage.
+--mov r0, 16
+--mov -, vw_wait
+--
+- bra -, ra31
+--add vw_setup, rb26, r0 # VDW setup 0
+--mov vw_setup, rb29 # Stride
+--mov vw_addr, unif # start the VDW
+-+mov r0, unif           # Delay 1
+-+mov r0, unif           # Delay 2
+-+nop                    # Delay 3
+-+
+- 
+- ################################################################################
+- 
+--- 
+-2.7.4
+-
+-
+-From 12e57278cb19a769d2e1488e8e94003027493d09 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 14 May 2015 11:36:24 +0100
+-Subject: [PATCH 33/68] Cutdown size of chroma prediction commands
+-
+----
+- libavcodec/hevc.c          |  17 +-
+- libavcodec/rpi_shader.c    | 543 ++++++++++++++++++++++-----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  11 +-
+- 4 files changed, 281 insertions(+), 302 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index a47ebc5..32b89d5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -56,7 +56,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+--#define RPI_CHROMA_COMMAND_WORDS 12
+-+#define RPI_CHROMA_COMMAND_WORDS 10
+- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+-@@ -2032,11 +2032,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2091,9 +2088,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2154,11 +2149,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my][0];
+--                      u++;
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      u+=2; // Intermediate results are not written back in first pass of B filtering
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-@@ -2166,11 +2158,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+--                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+--                      u++;
+-                       *u++ = rpi_filter_coefs[_my2][0];
+--                      u++;
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2808,7 +2797,7 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+--        s->u_mvs[i] += 3;  // Padding words
+-+        s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index ba453a2..b0b93b5 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -89,291 +89,286 @@ unsigned int rpi_shader[] = {
+- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+- /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+-+/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
+- /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 45dbe0e..99927c4 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 316)
+--#define mc_filter_uv_b (rpi_shader + 466)
+--#define mc_exit (rpi_shader + 640)
+--#define mc_interrupt_exit8 (rpi_shader + 658)
+--#define mc_end (rpi_shader + 688)
+-+#define mc_filter_uv (rpi_shader + 148)
+-+#define mc_filter_uv_b0 (rpi_shader + 310)
+-+#define mc_filter_uv_b (rpi_shader + 458)
+-+#define mc_exit (rpi_shader + 630)
+-+#define mc_interrupt_exit8 (rpi_shader + 648)
+-+#define mc_end (rpi_shader + 678)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index e138c95..d9ffcda 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -167,8 +167,6 @@ add t0s, r2, r1
+- 
+- # Dump padding words
+- mov r0, unif
+--mov r0, unif
+--mov r0, unif
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -228,11 +226,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -362,11 +359,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+-@@ -490,11 +486,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
+- asr ra2, r0, rb23;      mul24 r0, r0, ra22
+- asr ra1, r0, rb23;      mul24 r0, r0, ra22
+- asr ra0, r0, rb23;      mov r0, unif
+--                        mov r0, unif
+- asr rb11, r0, rb23;     mul24 r0, r0, ra22
+- asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+-+asr rb8, r0, rb23
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+--- 
+-2.7.4
+-
+-
+-From 3e8f02cf9d3e4bfcd07a5fcf321ace07c4f2e6f3 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:21:49 +0100
+-Subject: [PATCH 34/68] hevc: don't redirect when not rpi_enabled
+-
+----
+- libavcodec/hevc.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 32b89d5..2459e34 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -1468,7 +1468,7 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+-  */
+- 
+- #ifdef RPI_INTER
+--#define RPI_REDIRECT(fn) rpi_ ## fn
+-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+- static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+--- 
+-2.7.4
+-
+-
+-From 6da455b382b28c3c1f4e98c1703a695cdb946ad3 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:22:02 +0100
+-Subject: [PATCH 35/68] Use /dev/vcio for mailbox access
+-
+----
+- libavcodec/rpi_mailbox.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-index 536896f..77a56dd 100644
+---- a/libavcodec/rpi_mailbox.c
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -39,7 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- 
+- #define MAJOR_NUM 100
+- #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+--#define DEVICE_FILE_NAME "/dev/char_dev"
+-+#define DEVICE_FILE_NAME "/dev/vcio"
+- 
+- #include "rpi_mailbox.h"
+- 
+--- 
+-2.7.4
+-
+-
+-From f96ef6131f16a4c03b8e2882bdf7319c3b646a6c Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:25:25 +0100
+-Subject: [PATCH 36/68] Use vcsm for all memory allocations
+-
+----
+- libavcodec/rpi_qpu.c | 174 +++++++++++++++++++--------------------------------
+- 1 file changed, 64 insertions(+), 110 deletions(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 60bf079..f62051f 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,7 +1,5 @@
+- #ifdef RPI
+--// define RPI_USE_VCSM to use the vcsm device for shared memory
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+--#define RPI_USE_VCSM
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+- #define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-@@ -25,9 +23,7 @@
+- #include "rpi_shader.h"
+- #include "rpi_hevc_transform.h"
+- 
+--#ifdef RPI_USE_VCSM
+- #include "rpi_user_vcsm.h"
+--#endif
+- 
+- // On Pi2 there is no way to access the VPU L2 cache
+- // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+-@@ -96,7 +92,6 @@ struct GPU
+-   unsigned int vpu_code[VPU_CODE_SIZE];
+-   short transMatrix2even[16*16*2];
+-   int open_count; // Number of allocated video buffers
+--  unsigned int vc_handle; // Handle of this memory
+-   int      mb; // Mailbox handle
+-   int      vc; // Address in GPU memory
+-   int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-@@ -105,6 +100,7 @@ struct GPU
+- // Stop more than one thread trying to allocate memory or use the processing resources at once
+- static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+-+static GPU_MEM_PTR_T gpu_mem_ptr;
+- 
+- #if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+- static unsigned int Microseconds(void) {
+-@@ -132,39 +128,27 @@ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+- 
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+-+
+- // Connect to QPU, returns 0 on success.
+- static int gpu_init(volatile struct GPU **gpu) {
+-   int mb = mbox_open();
+-   int vc;
+--  int handle;
+-   volatile struct GPU* ptr;
+- 	if (mb < 0)
+- 		return -1;
+- 
+- 	if (qpu_enable(mb, 1)) return -2;
+- 
+--#ifdef RPI_USE_VCSM
+-   vcsm_init();
+--#endif
+-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-+  memset(ptr, 0, sizeof *ptr);
+-+  vc = gpu_mem_ptr.vc;
+- 
+--  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
+--  if (!handle)
+--  {
+--    qpu_enable(mb, 0);
+--    return -3;
+--  }
+--	vc = mem_lock(mb, handle);
+--	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
+--	if (ptr == NULL)
+--	{	mem_free(mb, handle);
+--		mem_unlock(mb, handle);
+--		qpu_enable(mb, 0);
+--		return -4;
+--	}
+--
+--	ptr->mb = mb;
+--	ptr->vc_handle = handle;
+--	ptr->vc = vc;
+-+  ptr->mb = mb;
+-+  ptr->vc = vc;
+- 
+-   printf("GPU allocated at 0x%x\n",vc);
+- 
+-@@ -226,94 +210,74 @@ static void gpu_unlock(void) {
+-   pthread_mutex_unlock(&gpu_mutex);
+- }
+- 
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  assert(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  assert(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  assert(p->arm);
+-+  p->vc = mem_lock(mb, p->vc_handle);
+-+  assert(p->vc);
+-+  return 0;
+-+}
+-+
+- // Allocate memory on GPU
+- // Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+- // Returns 0 on success.
+- // This allocates memory that will not be cached in ARM's data cache.
+- // Therefore safe to use without data cache flushing.
+--int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-   gpu_lock();
+--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+--  p->vcsm_handle = 0;
+--  if (!p->vc_handle)
+--  {
+--    qpu_enable(gpu->mb, 0);
+--    return -3;
+--  }
+--  p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+--  p->numbytes = numbytes;
+--  if (p->arm == NULL)
+--  {
+--    mem_free(gpu->mb, p->vc_handle);
+--    mem_unlock(gpu->mb, p->vc_handle);
+--    gpu_unlock();
+--    qpu_enable(gpu->mb, 0);
+--    return -4;
+--  }
+-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+-   gpu->open_count++;
+-   gpu_unlock();
+--  return 0;
+-+  return r;
+- }
+- 
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--  // This only works when using RPI_USE_VCSM
+-   void *tmp = vcsm_lock(p->vcsm_handle);
+-   vcsm_unlock_ptr(tmp);
+- }
+- 
+-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+-+  assert(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  assert(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  assert(p->arm);
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  assert(p->vc);
+-+  return 0;
+-+}
+-+
+- // This allocates data that will be
+- //    Cached in ARM L2
+- //    Uncached in VPU L2
+--int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
+-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-   gpu_lock();
+--#ifdef RPI_USE_VCSM
+--  {
+--      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
+--      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
+--      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--      p->arm = vcsm_lock(p->vcsm_handle);
+--      p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  }
+--#else
+--  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
+--  p->vcsm_handle = 0;
+--  if (!p->handle)
+--  {
+--    qpu_enable(gpu->mb, 0);
+--    return -3;
+--  }
+--  p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  printf("This mapmem_private does not seem to work\n");
+--  exit(-1);
+--  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
+--  p->numbytes = numbytes;
+--  if (p->arm == NULL)
+--  {
+--    mem_free(gpu->mb, p->handle);
+--    mem_unlock(gpu->mb, p->handle);
+--    gpu_unlock();
+--    qpu_enable(gpu->mb, 0);
+--    return -4;
+--  }
+--#endif
+-+  r = gpu_malloc_cached_internal(numbytes, p);
+-   gpu->open_count++;
+-   gpu_unlock();
+--  return 0;
+-+  return r;
+- }
+- 
+- static void gpu_term(void)
+- {
+--	int mb;
+--	unsigned handle;
+-+  int mb;
+- 
+-   if (gpu==NULL)
+-     return;
+-   mb = gpu->mb;
+--  handle = gpu->vc_handle;
+- 
+- #ifdef RPI_ASYNC
+-   {
+-@@ -323,37 +287,26 @@ static void gpu_term(void)
+-   }
+- #endif
+- 
+-+  qpu_enable(mb, 0);
+-+  gpu_free_internal(&gpu_mem_ptr);
+- 
+--	unmapmem((void*)gpu, sizeof(struct GPU));
+--	mem_unlock(mb, handle);
+--	mem_free(mb, handle);
+--	qpu_enable(mb, 0);
+--#ifdef RPI_USE_VCSM
+-   vcsm_exit();
+--#endif
+--	mbox_close(mb);
+-+
+-+  mbox_close(mb);
+-   gpu = NULL;
+- }
+- 
+--void gpu_free(GPU_MEM_PTR_T *p) {
+-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
+-   int mb = gpu->mb;
+--	unsigned handle = p->vc_handle;
+-+  mem_unlock(mb,p->vc_handle);
+-+  vcsm_unlock_ptr(p->arm);
+-+  vcsm_free(p->vcsm_handle);
+-+}
+-+
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-   gpu_lock();
+--#ifdef RPI_USE_VCSM
+--  if (p->vcsm_handle) {
+--      mem_unlock(mb,p->vc_handle);
+--      vcsm_unlock_ptr(p->arm);
+--      vcsm_free(p->vcsm_handle);
+--  } else {
+--	unmapmem((void*)p->arm, sizeof(struct GPU));
+--      mem_unlock(mb, handle);
+--      mem_free(mb, handle);
+--  }
+--#else
+--	unmapmem((void*)p->arm, sizeof(struct GPU));
+--	mem_unlock(mb, handle);
+--	mem_free(mb, handle);
+--#endif
+-+
+-+  gpu_free_internal(p);
+- 
+-   gpu->open_count--;
+-   if (gpu->open_count==0) {
+-@@ -386,20 +339,21 @@ unsigned int vpu_get_constants(void) {
+- 
+- static void *vpu_start(void *arg) {
+-   while(1) {
+-+    int *p;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
+-       pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+--    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    p = vpu_cmds[vpu_async_head%MAXCMDS];
+-     pthread_mutex_unlock(&post_mutex);
+- 
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+-     if (p[7]) {
+--        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-+        //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-     }
+-     vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+--- 
+-2.7.4
+-
+-
+-From 7c94b833b48a455d27d82eb2ca1b53a162705caf Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 15:43:17 +0100
+-Subject: [PATCH 37/68] Enable EARLY_MALLOC and fix sps access bug
+-
+----
+- libavcodec/hevc.c | 5 +++--
+- 1 file changed, 3 insertions(+), 2 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2459e34..4e82a15 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -44,7 +44,7 @@
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+-   // For some unknown reason, the code seems to crash if I do a late malloc
+--  #define EARLY_MALLOC
+-+  //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- #endif
+-@@ -149,7 +149,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- #ifdef RPI
+- #ifdef EARLY_MALLOC
+- #else
+--    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
+-+    assert(sps);
+-+    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-     printf("pic_arrays_init\n");
+-     printf("Allocated %d\n",coefs_per_row);
+--- 
+-2.7.4
+-
+-
+-From 0a0a92817a7959d213dca9c75a242b6ad88d6b80 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 14 May 2015 16:40:51 +0100
+-Subject: [PATCH 38/68] Add copy of av_mod_uintp2 for use with stable ffmpeg
+-
+----
+- libavcodec/hevc.c | 8 ++++++++
+- 1 file changed, 8 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4e82a15..80db603 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -51,6 +51,14 @@
+- 
+- // #define DISABLE_MC
+- 
+-+#ifndef av_mod_uintp2
+-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
+-+{
+-+    return a & ((1 << p) - 1);
+-+}
+-+#   define av_mod_uintp2   av_mod_uintp2_c
+-+#endif
+-+
+- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+- 
+- 
+--- 
+-2.7.4
+-
+-
+-From c48d08e968b24c2e260b0cc76c7901a1b4d75bbf Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 18 May 2015 11:11:02 +0100
+-Subject: [PATCH 39/68] Added support for weighted prediction in P frames
+-
+----
+- libavcodec/hevc.c          |  52 ++++-
+- libavcodec/rpi_shader.c    | 566 +++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |  12 +-
+- libavcodec/rpi_shader.qasm |  39 +++-
+- 4 files changed, 384 insertions(+), 285 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 80db603..9668ef8 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -64,7 +64,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+--#define RPI_CHROMA_COMMAND_WORDS 10
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+- #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+-@@ -2031,6 +2031,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-                 int chan = x0>>8;
+-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2043,6 +2045,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      if (weight_flag) {
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1] & 0xffff);
+-+                      } else {
+-+                          *u++ = 1; // Weight of 1 and offset of 0
+-+                          *u++ = 1;
+-+                      }
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2085,6 +2094,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+-                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+-                 int chan = x0>>8;
+-+                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-@@ -2098,6 +2109,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      if (weight_flag) {
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
+-+                      } else {
+-+                          *u++ = 1; // Weight of 1 and offset of 0
+-+                          *u++ = 1;
+-+                      }
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2159,6 +2177,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-+                      u+=2; // Weights not supported in B slices
+-                       u+=2; // Intermediate results are not written back in first pass of B filtering
+- 
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-@@ -2169,6 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-+                      u+=2; // Weights not supported in B slices
+-                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-@@ -2795,6 +2815,9 @@ static void rpi_inter_clear(HEVCContext *s)
+-     int i;
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+
+-     for(i=0;i<8;i++) {
+-         s->u_mvs[i] = s->mvs_base[i];
+-         *s->u_mvs[i]++ = 0;
+-@@ -2806,6 +2829,13 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        if (weight_flag) {
+-+            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-+            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+        } else {
+-+            *s->u_mvs[i]++ = 1 << 5;
+-+            *s->u_mvs[i]++ = 6;
+-+        }
+-         s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+-@@ -2849,12 +2879,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+- 
+- #ifdef RPI
+-+#ifdef RPI_INTER_QPU
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+-                     && s->ps.sps->width <= RPI_MAX_WIDTH
+-                     && !s->ps.pps->cross_component_prediction_enabled_flag
+-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+--                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+-+#else
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag
+-+                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+-+#endif
+-+
+-+    /*if (!s->enable_rpi) {
+-+      if (s->ps.pps->cross_component_prediction_enabled_flag)
+-+        printf("Cross component\n");
+-+      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+-+        printf("Tiles\n");
+-+      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+-+        printf("Weighted P slice\n");
+-+      if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-+        printf("Weighted B slice\n");
+-+    }*/
+- 
+- #endif
+- 
+-@@ -2987,6 +3034,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+- 
+- #ifdef RPI
+-     s->enable_rpi = 0;
+-+    //printf("Wavefront\n");
+- #endif
+- 
+-     if(ctb_row) {
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index b0b93b5..3f04d80 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -88,287 +88,307 @@ unsigned int rpi_shader[] = {
+- /* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+- /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+-+/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+- // ::mc_filter_uv
+--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
+--/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+--/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+-+/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+-+/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+-+/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+-+/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 99927c4..cec9901 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,11 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 148)
+--#define mc_filter_uv_b0 (rpi_shader + 310)
+--#define mc_filter_uv_b (rpi_shader + 458)
+--#define mc_exit (rpi_shader + 630)
+--#define mc_interrupt_exit8 (rpi_shader + 648)
+--#define mc_end (rpi_shader + 678)
+-+#define mc_filter_uv (rpi_shader + 152)
+-+#define mc_filter_uv_b0 (rpi_shader + 342)
+-+#define mc_filter_uv_b (rpi_shader + 494)
+-+#define mc_exit (rpi_shader + 670)
+-+#define mc_interrupt_exit8 (rpi_shader + 688)
+-+#define mc_end (rpi_shader + 718)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index d9ffcda..97c4c02 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -9,7 +9,12 @@
+- #                                               (ra15 isn't clamped to zero - this happens during the
+- #                                                copy to ra14, and during its use in the vertical filter)
+- #
+--# rb8...rb15                                    eight vertical filter coefficients
+-+# rb8...rb11                                    eight vertical filter coefficients
+-+
+-+# rb12 offset to add before shift
+-+# rb13 shift
+-+# rb14 weight (U on left, V on right)
+-+# rb15 offset (U on left, V on right)
+- #
+- # ra16                                          clipped(row start address+elem_num)&~3
+- # ra17                                          per-channel shifts
+-@@ -165,6 +170,9 @@ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_x2_base, r2
+- add t0s, r2, r1
+- 
+-+mov rb12,unif # offset before shift
+-+mov rb13,unif # offset after shift
+-+
+- # Dump padding words
+- mov r0, unif
+- 
+-@@ -231,11 +239,21 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+mov r0, unif # U offset/weight
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+mov r0, unif # V offset/weight
+-+asr.ifnz rb15, r0, r2
+-+shl r0, r0, r2
+-+asr.ifnz rb14, r0, r2
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+- mov r5rep, -8
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+-@@ -279,6 +297,11 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+-+mov rb12,32
+-+mov rb13,6
+-+mov rb14,1
+-+mov rb15,0
+-+
+- # apply vertical filter and write to VPM
+- 
+- nop                     ; mul24 r1, ra14, rb10
+-@@ -288,9 +311,11 @@ add r1, r1, r0          ; mul24 r0, ra15, rb11
+- add r1, r1, r0          ; mov -, vw_wait
+- sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+- asr r1, r1, 14
+--add r1, r1, ra21
+-+nop                     ; mul24 r1, r1, rb14
+-+add r1, r1, rb12
+-+asr r1, r1, rb13
+- brr.anyn -, r:uvloop
+--asr r1, r1, 6          # Delay 1
+-+add r1, r1, rb15       # Delay 1
+- min r1, r1, rb22       # Delay 2
+- max vpm, r1, 0         # Delay 3
+- 
+-@@ -364,6 +389,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov r0, unif # U offset/weight
+-+mov r0, unif # V offset/weight
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+-@@ -491,6 +519,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
+- asr rb9, r0, rb23;      mul24 r0, r0, ra22
+- asr rb8, r0, rb23
+- 
+-+mov r0, unif # U offset/weight
+-+mov r0, unif # V offset/weight
+-+
+- # r2 is elem_num
+- # r3 is loop counter
+- 
+--- 
+-2.7.4
+-
+-
+-From 310d994ea39e29b41a6a013abc4d94e6b90487b2 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 19 May 2015 08:43:30 +0100
+-Subject: [PATCH 40/68] Improved ordering of tasks
+-
+----
+- libavcodec/hevc.c | 8 ++++----
+- 1 file changed, 4 insertions(+), 4 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 9668ef8..951e2d3 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2943,15 +2943,15 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+-           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--#ifdef RPI_INTER_QPU
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--#endif
+-             // Transform all blocks
+-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+#endif
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+- 
+--- 
+-2.7.4
+-
+-
+-From d6e1ce7898196e49e52a6223c12979b3d0014588 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 19:58:19 +0100
+-Subject: [PATCH 41/68] Drafted Luma inter prediction
+-
+----
+- libavcodec/rpi_shader.qasm | 594 ++++++++++++++++++++++++++++++++++++++++++---
+- 1 file changed, 554 insertions(+), 40 deletions(-)
+-
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 97c4c02..9cfc0d9 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -2,7 +2,10 @@
+- #
+- # ra0...ra7                                     eight horizontal filter coefficients
+- #
+--# rb1...rb7                                     seven shifted copies of the current unfiltered row
+-+# rb0 rx_shift2
+-+# rb1 ra_y2_next
+-+#
+-+# rb4...rb7
+- #
+- # ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
+- #
+-@@ -26,9 +29,9 @@
+- # rb19                                          next ra16
+- #
+- # ra20                                          1
+--# ra21                                          32
+-+# ra21                                          ra_21
+- # ra22                                          256
+--# ra23                                          8
+-+# ra23                                          rx_shift2_next
+- #
+- # rb20                                          0xffffff00
+- # rb21                                          vpm_setup for reading/writing 16bit results into VPM
+-@@ -57,16 +60,23 @@
+- .set rb_frame_width_minus_1,       rb25
+- .set rb_frame_height_minus_1,      rb30
+- .set rb_pitch,                     rb16
+--.set ra_x_base,                    ra16
+--.set rb_x_base_next,               rb19
+--.set ra_x2_base,                   ra24
+--.set ra_x2_base_next,              ra26
+-+.set ra_x,                         ra16
+-+.set ra_y2,                        ra21
+-+.set ra_y2_next,                   rb1
+-+
+-+.set rb_x_next,                    rb19
+-+.set rx_frame_base2_next,          rb19
+-+
+-+.set ra_frame_base,                ra24
+-+.set ra_frame_base_next,           ra26
+- .set ra_xshift,                    ra17
+- 
+--.set ra_x2shift,                   ra25
+- .set ra_u2v_ref_offset,            ra25
+-+.set ra_frame_base2,               ra25
+- 
+- .set ra_xshift_next,               ra19
+-+.set rx_xshift2,                   rb0
+-+.set rx_xshift2_next,              ra23
+- 
+- .set ra_x2shift_next,              ra27
+- .set ra_u2v_dst_offset,            ra27
+-@@ -83,11 +93,11 @@
+- mov ra31, unif
+- 
+- # Load first request location
+--add ra_x_base, unif, elem_num # Store x
+-+add ra_x, unif, elem_num # Store x
+- mov ra_y, unif # Store y
+--mov ra_x2_base, unif # Store frame u base
+-+mov ra_frame_base, unif # Store frame u base
+- nop
+--sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
+-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+- 
+- # Read image dimensions
+- sub rb25,unif,1
+-@@ -104,9 +114,7 @@ add rb24, r1, r0
+- # load constants
+- 
+- mov ra20, 1
+--mov ra21, 32
+- mov ra22, 256
+--mov ra23, 8
+- mov ra30, 64
+- 
+- mov rb20, 0xffffff00
+-@@ -156,18 +164,18 @@ mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which i
+- add rb21, r0, r1
+- 
+- # Compute base address for first and second access
+--mov r0, ra_x_base           # Load x
+-+mov r0, ra_x           # Load x
+- max r0, r0, 0; mov r1, ra_y # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+- shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+- add ra_y, r1, 1
+- add r0, r0, r3
+- and r0, r0, ~3
+--max r1, r1, 0 ; mov ra_x_base, r0 # y
+-+max r1, r1, 0 ; mov ra_x, r0 # y
+- min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--add t0s, r0, r1 ; mov ra_x2_base, r2
+-+add t0s, r0, r1 ; mov ra_frame_base, r2
+- add t0s, r2, r1
+- 
+- mov rb12,unif # offset before shift
+-@@ -182,8 +190,8 @@ min r1, r1, rb_frame_height_minus_1
+- add ra_y, ra_y, 1
+- bra -, ra31
+- nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_x_base
+--add t0s, r1, ra_x2_base
+-+add t0s, r1, ra_x
+-+add t0s, r1, ra_frame_base
+- 
+- 
+- 
+-@@ -192,7 +200,7 @@ add t0s, r1, ra_x2_base
+- # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+- ::mc_filter_uv
+- mov ra31, unif
+- 
+-@@ -207,9 +215,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -265,16 +273,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -297,7 +305,7 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+--mov rb12,32
+-+mov rb12,32 # TODO remove these to make P weighted prediction work properly
+- mov rb13,6
+- mov rb14,1
+- mov rb15,0
+-@@ -342,7 +350,7 @@ mov vw_addr, unif # start the VDW
+- # mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+--# ra_x_base, ra_x16_base point to the current coordinates for this block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+- ::mc_filter_uv_b0
+- mov ra31, unif
+- 
+-@@ -357,9 +365,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write, we need to save 16bit precision
+- mov vw_setup, rb21
+-@@ -408,16 +416,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -477,9 +485,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+- shl ra_xshift_next, r0, 3
+- sub r2, unif, r3 # compute offset from frame base u to frame base v
+- add r0, r0, r3
+--and rb_x_base_next, r0, ~3
+-+and rb_x_next, r0, ~3
+- mov ra_y_next, r1
+--add ra_x2_base_next, rb_x_base_next, r2
+-+add ra_frame_base_next, rb_x_next, r2
+- 
+- # set up VPM write
+- mov vw_setup, rb28
+-@@ -538,16 +546,16 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+- 
+- max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_x2_base, r2
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t0s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -642,5 +650,511 @@ nop        ; nop ; thrend
+- mov interrupt, 1; nop # delay slot 1
+- nop        ; nop # delay slot 2
+- 
+-+
+-+
+-+
+-+
+-+# LUMA CODE
+-+
+-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+-+# For P frames we make the second x,y coordinates offset by +8
+-+
+-+################################################################################
+-+# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
+-+::mc_setup
+-+
+-+# Read starting kernel
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+mov ra31, unif
+-+
+-+# Compute base address for first and second access
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+add ra_y, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+-+max r1, r1, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2, r1, 1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+-+max r1, r1, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+
+-+
+-+# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+-+
+-+# get source pitch
+-+mov rb16, unif
+-+
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+-+
+-+# load constants
+-+
+-+mov ra20, 1
+-+mov ra22, 256
+-+mov ra30, 64
+-+
+-+mov rb20, 0xffffff00
+-+mov rb22, 255
+-+mov rb23, 24
+-+
+-+# touch vertical context to keep simulator happy
+-+
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+-+
+-+# Compute part of VPM to use for DMA output
+-+mov r2, qpu_num
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1
+-+
+-+# Compute part of VPM to save data into
+-+mov r2, qpu_num   # qpu_num = abcd
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1
+-+
+-+mov rb12,unif # offset before shift
+-+mov rb13,unif # shift
+-+
+-+# Dump padding words
+-+mov r0, unif
+-+
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_frame_base
+-+
+-+max r1, ra_y2, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+bra -, ra31
+-+add ra_y2, ra_y2, 1           # Delay 1
+-+nop ; mul24 r1, r1, rb_pitch  # Delay 2
+-+add t0s, r1, ra_frame_base2   # Delay 3
+-+
+-+
+-+################################################################################
+-+
+-+# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# In a P block, only the first half of coefficients contain used information.
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# ra_x, ra_x16_base point to the current coordinates for this block
+-+::mc_filter
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # Z flag set on elements 0-7 only, so .ifnz below selects elements 8-15
+-+mov ra31, unif # next_kernel: address taken by 'bra -, ra31' at the end of this routine
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+mov rx_xshift2, rx_xshift2_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+mov ra_y_next, r1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0   ; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2_next, r1, 1 # NOTE(review): +1 here, while ra_y_next above is loaded without it - confirm this asymmetry is intended
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+-+
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5 # rb17 = height+5: loop count at which yloop switches to the *_next base/y values
+-+add rb18, r0, 7 # rb18 = height+7: yloop keeps running until the loop counter reaches this
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+# get filter coefficients and discard unused B frame values
+-+mov r0, unif
+-+mov.ifnz -, unif # Alternate coefficients are unused for P frames
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+mov.ifnz -, unif
+-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb4, r0, rb23
+-+
+-+mov r0, unif # Frame0 offset/weight
+-+mov.ifnz -, unif # Frame1 offset/weight unused
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+
+-+# r3 is loop counter
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:yloop
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+# If we knew there was no clipping then this code would get simpler.
+-+# Perhaps we could add on the pitch and clip using larger values?
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, rx_xshift2
+-+mov.ifz ra_y2, ra_y2_next
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y2, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# apply horizontal filter
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 8    ; mov ra12, ra13 # NOTE(review): ra8 is read by the vertical filter below but never written in this routine; ra12 is moved again four lines down, so 'mov ra8, ra9' may have been intended - confirm against the assembled shader
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+brr.anyn -, r:yloop # loop back (no output row yet) while the counter in r3 is below 8
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1, rb14
+-+add r1, r1, rb12
+-+asr r1, r1, rb13
+-+brr.anyn -, r:yloop # loop back while the counter in r3 is below rb18 (height+7)
+-+add r1, r1, rb15       # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+mov vw_setup, rb29 # Stride         Delay 2
+-+mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+
+-+
+-+################################################################################
+-+
+-+# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# Unlike mc_filter, the second value of each coefficient/weight pair is used here: it is loaded into elements 8-15 (mov.ifnz r0, unif).
+-+# At this point we have already issued two pairs of texture requests for the current block
+-+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+-+# Can fill in the coefficients so only
+-+# Can also assume default weighted prediction for B frames.
+-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+-+# Or possibly by taking advantage of symmetry?
+-+# From 19->7 32bits per command.
+-+::mc_filter_b
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # Z flag set on elements 0-7 only, so .ifnz below selects elements 8-15
+-+mov ra31, unif # next_kernel: address taken by 'bra -, ra31' at the end of this routine
+-+
+-+# per-channel shifts were calculated on the *previous* invocation
+-+
+-+mov ra_xshift, ra_xshift_next
+-+mov rx_xshift2, rx_xshift2_next
+-+
+-+# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl ra_xshift_next, r0, 3 # Compute shifts
+-+mov ra_y_next, r1
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+-+
+-+add r0, unif, elem_num # Load x
+-+max r0, r0, 0   ; mov r1, unif # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+shl rx_xshift2_next, r0, 3 # Compute shifts
+-+add ra_y2_next, r1, 1 # NOTE(review): +1 here, while ra_y_next above is loaded without it - confirm this asymmetry is intended
+-+and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+-+
+-+
+-+# set up VPM write
+-+mov vw_setup, rb28
+-+
+-+# get width,height of block
+-+mov r2, 16
+-+mov r0, unif
+-+shr r1, r0, r2 # Extract width
+-+sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+and r0, r0, rb22 # Extract height
+-+add rb17, r0, 5 # rb17 = height+5: loop count at which yloopb switches to the *_next base/y values
+-+add rb18, r0, 7 # rb18 = height+7: yloopb keeps running until the loop counter reaches this
+-+shl r0, r0, 7
+-+add r0, r0, r1 # Combine width and height of destination area
+-+shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+-+
+-+# get filter coefficients: first value of each pair in elements 0-7, second value in elements 8-15
+-+mov r0, unif
+-+mov r1, 1 # r1 = 1: the multiplier for the rotate-only mul24s below
+-+mov.ifnz r0, unif # Elements 8-15 take the alternate coefficients
+-+nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
+-+asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+asr ra0, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+asr ra4, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb8, r0, rb23;      mov r0, unif
+-+mov.ifnz r0, unif
+-+asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+asr rb4, r0, rb23
+-+
+-+mov r0, unif # Frame0 offset/weight
+-+mov.ifnz r0, unif # Frame1 offset/weight goes to elements 8-15
+-+asr rb15, r0, r2  # Compute offset from MSBs
+-+shl r0, r0, r2
+-+asr rb14, r0, r2  # Compute weight from LSBs
+-+
+-+# r3 is loop counter
+-+
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+mov r3, 0
+-+
+-+:yloopb
+-+# retrieve texture results and pick out bytes
+-+# then submit two more texture requests
+-+
+-+# If we knew there was no clipping then this code would get simpler.
+-+# Perhaps we could add on the pitch and clip using larger values?
+-+
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, rx_xshift2
+-+mov.ifz ra_y2, ra_y2_next
+-+
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+-+
+-+max r2, ra_y2, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+
+-+
+-+# generate seven shifted versions
+-+# interleave with scroll of vertical context
+-+
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# apply horizontal filter
+-+nop                  ; mul24 r2, r0, ra0
+-+nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+add r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 8    ; mov ra12, ra13 # NOTE(review): ra8 is read by the vertical filter below but never written in this routine; ra12 is moved again four lines down, so 'mov ra8, ra9' may have been intended - confirm against the assembled shader
+-+mov ra9, ra10
+-+mov ra10, ra11
+-+mov ra11, ra12
+-+mov ra12, ra13
+-+brr.anyn -, r:yloopb # loop back (no output row yet) while the counter in r3 is below 8
+-+mov ra13, ra14       # Delay slot 1
+-+mov ra14, ra15       # Delay slot 2
+-+mov ra15, r0         # Delay slot 3
+-+
+-+# apply vertical filter and write to VPM
+-+
+-+nop                     ; mul24 r1, ra14, rb10
+-+nop                     ; mul24 r0, ra13, rb9
+-+add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
+-+add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+add r1, r1, r0
+-+brr.anyn -, r:yloopb # loop back while the counter in r3 is below rb18 (height+7)
+-+asr r1, r1, 7          # Delay 1
+-+min r1, r1, rb22       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+bra -, ra31
+-+mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+mov vw_setup, rb29 # Stride         Delay 2
+-+mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+################################################################################
+-+
+-+# mc_interrupt_exit12(): wait for the VDW, read back four texture results, acquire semaphore 0 eleven times, then raise the host interrupt and end the thread
+-+::mc_interrupt_exit12
+-+mov  -, vw_wait # wait on the VDW
+-+
+-+ldtmu0 # four results are read and dropped - presumably the two pairs of requests the previous kernel issued ahead; confirm
+-+ldtmu0
+-+ldtmu0
+-+ldtmu0
+-+
+-+mov -,sacq(0) # 1  (eleven acquires of semaphore 0 - presumably one per other QPU; the matching releases are not in this routine)
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+-+mov -,sacq(0) # 8
+-+mov -,sacq(0) # 9
+-+mov -,sacq(0) # 10
+-+mov -,sacq(0) # 11
+-+
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+-+
+-+
+- ::mc_end
+- # Do not add code here because mc_end must appear after all other code.
+--- 
+-2.7.4
+-
+-
+-From f2ffe4186fa49cb27579953c276b51728a08a8b5 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 19:58:30 +0100
+-Subject: [PATCH 42/68] Added support for fast cache flush in deblocker
+-
+----
+- libavcodec/hevc_filter.c   |   44 +-
+- libavcodec/rpi_qpu.c       |    6 +
+- libavcodec/rpi_qpu.h       |    2 +
+- libavcodec/rpi_shader.c    | 1028 +++++++++++++++++++++++++++++---------------
+- libavcodec/rpi_shader.h    |   16 +-
+- libavcodec/rpi_user_vcsm.h |   22 +
+- 6 files changed, 768 insertions(+), 350 deletions(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 92a8271..186317a 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -37,6 +37,11 @@
+- 
+- #include "bit_depth_template.c"
+- 
+-+#ifdef RPI
+-+#include "rpi_user_vcsm.h"
+-+#include "rpi_qpu.h"
+-+#endif
+-+
+- #define LUMA 0
+- #define CB 1
+- #define CR 2
+-@@ -872,15 +877,46 @@ static void flush_buffer(AVBufferRef *bref) {
+-     gpu_cache_flush(p);
+- }
+- 
+--static void ff_hevc_flush_chroma(HEVCContext *s)
+-+// Return the GPU-side address (p->vc) of the buffer behind bref, with the top two bits masked off
+-+static int ff_hevc_buf_base(AVBufferRef *bref) {
+-+  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref); // the buffer's opaque data is its GPU_MEM_PTR_T descriptor
+-+  return p->vc & 0x3fffffff; // keeps the low 30 bits - presumably strips the VideoCore cache-alias bits to give a physical address; confirm
+-+}
+-+
+-+static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+-+#define RPI_FAST_CACHEFLUSH
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = f->progress->data[0];
+-+        int sz,base;
+-+        if (curr_y < 0) curr_y = 0;
+-+        if (n<=curr_y) return; // Should not happen
+-+        sz = s->frame->linesize[1] * (n-curr_y);
+-+        base = s->frame->linesize[1] * curr_y;
+-+        iocache.s[0].cmd = 3; // Flush L1 cache
+-+        iocache.s[0].addr = 0;
+-+        iocache.s[0].size  = 0;
+-+
+-+        iocache.s[1].cmd = 2;
+-+        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
+-+        iocache.s[1].size  = sz;
+-+
+-+        iocache.s[2].cmd = 2;
+-+        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
+-+        iocache.s[2].size  = sz;
+-+
+-+        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
+-+
+-+#else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-+#endif
+-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-         //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+-@@ -903,7 +939,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x, y - ctb_size);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s);
+-+                ff_hevc_flush_chroma(s,&s->ref->tf, y);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-             }
+-@@ -912,7 +948,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x , y);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s);
+-+                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-             }
+-@@ -922,7 +958,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+--        ff_hevc_flush_chroma(s);
+-+        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index f62051f..fd8a276 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -237,6 +237,12 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-   return r;
+- }
+- 
+-+int gpu_get_mailbox(void) // Returns the 'mb' member of the file-level gpu state
+-+{
+-+  assert(gpu); // the gpu state must already exist; this accessor does not create it
+-+  return gpu->mb;
+-+}
+-+
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+-   void *tmp = vcsm_lock(p->vcsm_handle);
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 543c84b..88965e5 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -49,4 +49,6 @@ extern int rpi_test_shader(void);
+- extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+- extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+- 
+-+extern int gpu_get_mailbox(void);
+-+
+- #endif
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 3f04d80..9c30e32 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -23,11 +23,11 @@ __attribute__((aligned(8)))
+- unsigned int rpi_shader[] = {
+- // ::mc_setup_uv
+- /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+- /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+--/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+- /* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+- /* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+- /* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+- /* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-@@ -35,360 +35,708 @@ unsigned int rpi_shader[] = {
+- /* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+- /* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+- /* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
+--/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
+--/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
+--/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
+--/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
+--/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
+--/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
+--/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
+-+/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000070] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000078] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
+--/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
+--/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
+--/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
+--/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
+--/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
+--/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_setup
+-+/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+-+/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+-+// ::mc_filter
+-+/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :yloop
+-+/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_filter_b
+-+/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
+-+/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+-+/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+// :yloopb
+-+/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+-+/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_interrupt_exit12
+-+/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index cec9901..3fa8531 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,11 +4,15 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 152)
+--#define mc_filter_uv_b0 (rpi_shader + 342)
+--#define mc_filter_uv_b (rpi_shader + 494)
+--#define mc_exit (rpi_shader + 670)
+--#define mc_interrupt_exit8 (rpi_shader + 688)
+--#define mc_end (rpi_shader + 718)
+-+#define mc_filter_uv (rpi_shader + 148)
+-+#define mc_filter_uv_b0 (rpi_shader + 338)
+-+#define mc_filter_uv_b (rpi_shader + 490)
+-+#define mc_exit (rpi_shader + 666)
+-+#define mc_interrupt_exit8 (rpi_shader + 684)
+-+#define mc_setup (rpi_shader + 714)
+-+#define mc_filter (rpi_shader + 868)
+-+#define mc_filter_b (rpi_shader + 1108)
+-+#define mc_interrupt_exit12 (rpi_shader + 1364)
+-+#define mc_end (rpi_shader + 1402)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-index fbebbbe..95e6de1 100644
+---- a/libavcodec/rpi_user_vcsm.h
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -418,6 +418,28 @@ int vcsm_unlock_hdl( unsigned int handle );
+- */
+- int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+- 
+-+/* Clean and/or invalidate the memory associated with this user opaque handle
+-+**
+-+** Returns:        non-zero on error
+-+**
+-+** structure contains a list of flush/invalidate commands. Commands are:
+-+** 0: nop
+-+** 1: invalidate given physical range in L2
+-+** 2: clean      given physical range in L2
+-+** 3: clean+invalidate all of L1
+-+** 4: flush      all of L2 and all of L1
+-+*/
+-+struct vcsm_user_clean_invalid_s {
+-+    struct {
+-+       unsigned int cmd;
+-+       unsigned int addr;
+-+       unsigned int size;
+-+    } s[8];
+-+};
+-+
+-+int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
+-+
+-+
+- #ifdef __cplusplus
+- }
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 09685ab55aecb9400e354522894e0fbbb6381ca9 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 20 May 2015 21:12:55 +0100
+-Subject: [PATCH 43/68] Added multi mailbox - not working
+-
+----
+- libavcodec/hevc.c        | 40 ++++++++++++++++++++++++++++---
+- libavcodec/rpi_mailbox.c | 47 +++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_mailbox.h |  5 ++++
+- libavcodec/rpi_qpu.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++----
+- libavcodec/rpi_qpu.h     |  2 ++
+- 5 files changed, 147 insertions(+), 8 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 951e2d3..ab63efd 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -47,6 +47,11 @@
+-   //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+-+
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-+    #define RPI_MULTI_MAILBOX
+-+  #endif
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -2843,10 +2848,14 @@ static void rpi_inter_clear(HEVCContext *s)
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-+    int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+--
+--    if (s->sh.slice_type == I_SLICE)
+--        return;
+-+    if (s->sh.slice_type == I_SLICE) {
+-+#ifdef RPI_MULTI_MAILBOX
+-+      rpi_execute_transform(s);
+-+      return;
+-+#endif
+-+    }
+-     for(k=0;k<8;k++) {
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-@@ -2856,6 +2865,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+-+#ifdef RPI_MULTI_MAILBOX
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+                                 );
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[i] = 0;
+-+#else
+-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-       (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-       (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-@@ -2866,6 +2891,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-       (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-       );
+-+#endif
+- }
+- #endif
+- 
+-@@ -2945,6 +2971,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-             // Transform all blocks
+-             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+#ifdef RPI_MULTI_MAILBOX
+-+            // Kick off inter prediction on QPUs
+-+            rpi_execute_inter_qpu(s);
+-+            // Perform luma inter prediction
+-+            rpi_execute_inter_cmds(s);
+-+#else
+-             rpi_execute_transform(s);
+-             // Perform inter prediction
+-             rpi_execute_inter_cmds(s);
+-@@ -2952,6 +2984,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             // Kick off inter prediction on QPUs
+-             rpi_execute_inter_qpu(s);
+- #endif
+-+#endif
+-+
+-             // Wait for transform completion
+-             vpu_wait(s->vpu_id);
+- 
+-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+-index 77a56dd..3904efc 100644
+---- a/libavcodec/rpi_mailbox.c
+-+++ b/libavcodec/rpi_mailbox.c
+-@@ -276,6 +276,53 @@ unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigne
+-    return p[5];
+- }
+- 
+-+void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30018; // (the tag id)
+-+   p[i++] = 88; // (size of the buffer)
+-+   p[i++] = 88; // (size of the data)
+-+
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = num_qpus_2;
+-+   p[i++] = control_2;
+-+   p[i++] = noflush_2;
+-+   p[i++] = timeout_2; // ms
+-+
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = code_2;
+-+   p[i++] = r0_2;
+-+   p[i++] = r1_2;
+-+   p[i++] = r2_2;
+-+   p[i++] = r3_2;
+-+   p[i++] = r4_2;
+-+   p[i++] = r5_2;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return;
+-+}
+-+
+- int mbox_open() {
+-    int file_desc;
+- 
+-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+-index c264d2e..5898102 100644
+---- a/libavcodec/rpi_mailbox.h
+-+++ b/libavcodec/rpi_mailbox.h
+-@@ -15,6 +15,11 @@ extern void unmapmem(void *addr, unsigned size);
+- 
+- extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+- extern unsigned qpu_enable(int file_desc, unsigned enable);
+- 
+- #endif
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index fd8a276..feb3284 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -123,7 +123,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+--static int vpu_cmds[MAXCMDS][8];
+-+static int vpu_cmds[MAXCMDS][16];
+- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+-@@ -346,6 +346,7 @@ unsigned int vpu_get_constants(void) {
+- static void *vpu_start(void *arg) {
+-   while(1) {
+-     int *p;
+-+    int qpu_code;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -358,12 +359,25 @@ static void *vpu_start(void *arg) {
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+--    if (p[7]) {
+-+    qpu_code = p[7];
+-+    //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+--    }
+--    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+    //}
+-+    if (!qpu_code) {
+-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+    } else {
+-+      int i;
+-+      for(i=0;i<8;i++) {
+-+        gpu->mail[i*2] = p[8+i];
+-+        gpu->mail[i*2 + 1] = qpu_code;
+-+      }
+- 
+-+      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              0, 0, 0, 0,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+    }
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+-@@ -400,7 +414,43 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+-     p[4] = r3;
+-     p[5] = r4;
+-     p[6] = r5;
+--    p[7] = (int) buf;
+-+    p[7] = 0;
+-+    if (num<=1)
+-+      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-+    pthread_mutex_unlock(&post_mutex);
+-+    return id;
+-+  }
+-+}
+-+
+-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+{
+-+
+-+  pthread_mutex_lock(&post_mutex);
+-+  {
+-+    int id = vpu_async_tail++;
+-+    int *p = vpu_cmds[id%MAXCMDS];
+-+    int num = vpu_async_tail - vpu_async_head;
+-+    if (num>MAXCMDS) {
+-+      printf("Too many commands submitted\n");
+-+      exit(-1);
+-+    }
+-+    p[0] = vpu_code;
+-+    p[1] = r0;
+-+    p[2] = r1;
+-+    p[3] = r2;
+-+    p[4] = r3;
+-+    p[5] = r4;
+-+    p[6] = r5;
+-+    p[7] = qpu_code;
+-+    p[8 ] = unifs1;
+-+    p[9 ] = unifs2;
+-+    p[10] = unifs3;
+-+    p[11] = unifs4;
+-+    p[12] = unifs5;
+-+    p[13] = unifs6;
+-+    p[14] = unifs7;
+-+    p[15] = unifs8;
+-     if (num<=1)
+-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -966,6 +1016,7 @@ void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, i
+- }
+- 
+- 
+-+
+- #endif
+- 
+- #endif // RPI
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 88965e5..2f08f03 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -41,6 +41,8 @@ extern unsigned int vpu_get_fn(void);
+- extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+--- 
+-2.7.4
+-
+-
+-From 311f2da06d13a98d9bdda2df8684d7cf55b9a08e Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 21 May 2015 16:50:02 +0100
+-Subject: [PATCH 44/68] Pass qpu number in as uniform
+-
+----
+- libavcodec/hevc.c          |    2 +-
+- libavcodec/rpi_shader.c    | 1288 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   20 +-
+- libavcodec/rpi_shader.qasm |   10 +-
+- 4 files changed, 657 insertions(+), 663 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ab63efd..caadfaa 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -2834,6 +2834,7 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        *s->u_mvs[i]++ = i;
+-         if (weight_flag) {
+-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-@@ -2841,7 +2842,6 @@ static void rpi_inter_clear(HEVCContext *s)
+-             *s->u_mvs[i]++ = 1 << 5;
+-             *s->u_mvs[i]++ = 6;
+-         }
+--        s->u_mvs[i] += 1;  // Padding words
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index 9c30e32..a0f0282 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -48,8 +48,8 @@ unsigned int rpi_shader[] = {
+- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
+-+/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+- /* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+- /* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+- /* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-@@ -60,669 +60,669 @@ unsigned int rpi_shader[] = {
+- /* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+- /* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+- /* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+--/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+--/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+--/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+--/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+--/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+-+/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+-+/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+-+/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+-+/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+-+/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
+-+/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
+-+/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
+-+/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
+-+/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+-+/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup
+--/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+--/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+--/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+--/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+--/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+--/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+--/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+-+/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
+-+/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
+-+/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+-+/* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+- // ::mc_filter
+--/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+-+/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloop
+--/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
+--/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+--/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+--/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+--/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+--/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+--/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+--/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+--/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+-+/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+-+/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
+-+/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+-+/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+-+/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+-+/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+-+/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+-+/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+-+/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+-+/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+-+/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+-+/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+-+/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+-+/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+-+/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloopb
+--/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+--/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+--/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+-+/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+-+/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+-+/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+-+/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_interrupt_exit12
+--/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+- /* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-@@ -732,11 +732,9 @@ unsigned int rpi_shader[] = {
+- /* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+- /* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 3fa8531..6e552d9 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,15 +4,15 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 148)
+--#define mc_filter_uv_b0 (rpi_shader + 338)
+--#define mc_filter_uv_b (rpi_shader + 490)
+--#define mc_exit (rpi_shader + 666)
+--#define mc_interrupt_exit8 (rpi_shader + 684)
+--#define mc_setup (rpi_shader + 714)
+--#define mc_filter (rpi_shader + 868)
+--#define mc_filter_b (rpi_shader + 1108)
+--#define mc_interrupt_exit12 (rpi_shader + 1364)
+--#define mc_end (rpi_shader + 1402)
+-+#define mc_filter_uv (rpi_shader + 144)
+-+#define mc_filter_uv_b0 (rpi_shader + 334)
+-+#define mc_filter_uv_b (rpi_shader + 486)
+-+#define mc_exit (rpi_shader + 662)
+-+#define mc_interrupt_exit8 (rpi_shader + 680)
+-+#define mc_setup (rpi_shader + 710)
+-+#define mc_filter (rpi_shader + 864)
+-+#define mc_filter_b (rpi_shader + 1104)
+-+#define mc_interrupt_exit12 (rpi_shader + 1360)
+-+#define mc_end (rpi_shader + 1398)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 9cfc0d9..a0b8e5a 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -133,8 +133,8 @@ mov ra14, 0
+- mov ra15, 0
+- 
+- # Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+mov r3, unif
+-+shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+- and r2, r2, 15
+- mov r1, r2
+- asr r1, r1, 2
+-@@ -147,8 +147,7 @@ shl r0, r0, 5
+- add rb27, r0, r1
+- 
+- # Compute part of VPM to save data into
+--mov r2, qpu_num   # qpu_num = abcd
+--shl r2, r2, 1
+-+shl r2, r3, 1
+- and r2, r2, 15    # r2 = bcd0
+- mov r1, r2        # r1 = bcd0
+- asr r1, r1, 2     # r1 = bc
+-@@ -181,9 +180,6 @@ add t0s, r2, r1
+- mov rb12,unif # offset before shift
+- mov rb13,unif # offset after shift
+- 
+--# Dump padding words
+--mov r0, unif
+--
+- # submit texture requests for second line
+- max r1, ra_y, 0
+- min r1, r1, rb_frame_height_minus_1
+--- 
+-2.7.4
+-
+-
+-From db6fe49d50e42c444b5833acc6206c0bbfaacef4 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Sat, 23 May 2015 13:20:21 +0100
+-Subject: [PATCH 45/68] Add new cache flushing routine
+-
+----
+- libavcodec/hevc.c          |  8 +++--
+- libavcodec/hevc_filter.c   | 39 ++++++++++-----------
+- libavcodec/rpi_qpu.c       | 17 +++++++--
+- libavcodec/rpi_qpu.h       |  2 ++
+- libavcodec/rpi_user_vcsm.h | 86 ++++++++++++++++++++++++++--------------------
+- 5 files changed, 91 insertions(+), 61 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index caadfaa..9d12583 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3575,9 +3575,13 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+-     }
+- 
+- fail:
+--    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+-+    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+-+#ifdef RPI_INTER_QPU
+-+        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+-+        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
+-+#endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+--
+-+    }
+-     return ret;
+- }
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 186317a..ec84e8a 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -883,36 +883,35 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+-   return p->vc & 0x3fffffff;
+- }
+- 
+--static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+-+void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-             s->nal_unit_type == NAL_STSA_N  ||
+-             s->nal_unit_type == NAL_RADL_N  ||
+-             s->nal_unit_type == NAL_RASL_N )) {
+--#define RPI_FAST_CACHEFLUSH
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+--        int curr_y = f->progress->data[0];
+-+        int curr_y = ((int *)f->progress->data)[0];
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        if (curr_y < 0) curr_y = 0;
+--        if (n<=curr_y) return; // Should not happen
+--        sz = s->frame->linesize[1] * (n-curr_y);
+--        base = s->frame->linesize[1] * curr_y;
+--        iocache.s[0].cmd = 3; // Flush L1 cache
+--        iocache.s[0].addr = 0;
+--        iocache.s[0].size  = 0;
+--
+--        iocache.s[1].cmd = 2;
+--        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+        iocache.s[0].handle = p->vcsm_handle;
+-+        iocache.s[0].cmd = 3; // clean+invalidate
+-+        iocache.s[0].addr = p->arm + base;
+-+        iocache.s[0].size  = sz;
+-+        p = av_buffer_pool_opaque(s->frame->buf[2]);
+-+        iocache.s[1].handle = p->vcsm_handle;
+-+        iocache.s[1].cmd = 3; // clean+invalidate
+-+        iocache.s[1].addr = p->arm + base;
+-         iocache.s[1].size  = sz;
+--
+--        iocache.s[2].cmd = 2;
+--        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
+--        iocache.s[2].size  = sz;
+--
+--        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
+--
+-+        vcsm_clean_invalid( &iocache );
+- #else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index feb3284..aa65a77 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -211,6 +211,7 @@ static void gpu_unlock(void) {
+- }
+- 
+- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-+  p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-   assert(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-@@ -243,13 +244,25 @@ int gpu_get_mailbox(void)
+-   return gpu->mb;
+- }
+- 
+-+// Call this to clean and invalidate a region of memory
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--  void *tmp = vcsm_lock(p->vcsm_handle);
+--  vcsm_unlock_ptr(tmp);
+-+#define RPI_FAST_CACHEFLUSH
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].size  = p->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp = vcsm_lock(p->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+- }
+- 
+- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 2f08f03..0565a60 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -1,6 +1,8 @@
+- #ifndef RPI_QPU_H
+- #define RPI_QPU_H
+- 
+-+#define RPI_FAST_CACHEFLUSH
+-+
+- typedef struct gpu_mem_ptr_s {
+-   unsigned char *arm; // Pointer to memory mapped on ARM side
+-   int vc_handle;   // Videocore handle of relocatable memory
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-index 95e6de1..db41a4d 100644
+---- a/libavcodec/rpi_user_vcsm.h
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -1,29 +1,41 @@
+--/*
+--Copyright (c) 2012, Broadcom Europe Ltd
+--All rights reserved.
+--
+--Redistribution and use in source and binary forms, with or without
+--modification, are permitted provided that the following conditions are met:
+--    * Redistributions of source code must retain the above copyright
+--      notice, this list of conditions and the following disclaimer.
+--    * Redistributions in binary form must reproduce the above copyright
+--      notice, this list of conditions and the following disclaimer in the
+--      documentation and/or other materials provided with the distribution.
+--    * Neither the name of the copyright holder nor the
+--      names of its contributors may be used to endorse or promote products
+--      derived from this software without specific prior written permission.
+--
+--THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+--ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+--WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+--DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+--DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+--(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+--LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+--ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+--(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+--SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+--*/
+-+/*****************************************************************************
+-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+-+*
+-+* This program is the proprietary software of Broadcom Corporation and/or
+-+* its licensors, and may only be used, duplicated, modified or distributed
+-+* pursuant to the terms and conditions of a separate, written license
+-+* agreement executed between you and Broadcom (an "Authorized License").
+-+* Except as set forth in an Authorized License, Broadcom grants no license
+-+* (express or implied), right to use, or waiver of any kind with respect to
+-+* the Software, and Broadcom expressly reserves all rights in and to the
+-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
+-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+-+* THE SOFTWARE.
+-+*
+-+* Except as expressly set forth in the Authorized License,
+-+* 1. This program, including its structure, sequence and organization,
+-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
+-+*    all reasonable efforts to protect the confidentiality thereof, and to
+-+*    use this information only in connection with your use of Broadcom
+-+*    integrated circuit products.
+-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+-+*****************************************************************************/
+- 
+- #ifndef __USER_VCSM__H__INCLUDED__
+- #define __USER_VCSM__H__INCLUDED__
+-@@ -424,21 +436,21 @@ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+- **
+- ** structure contains a list of flush/invalidate commands. Commands are:
+- ** 0: nop
+--** 1: invalidate given physical range in L2
+--** 2: clean      given physical range in L2
+--** 3: clean+invalidate all of L1
+--** 4: flush      all of L2 and all of L1
+-+** 1: invalidate       given virtual range in L1/L2
+-+** 2: clean            given virtual range in L1/L2
+-+** 3: clean+invalidate given virtual range in L1/L2
+-+** 4: flush all L1/L2
+- */
+- struct vcsm_user_clean_invalid_s {
+--    struct {
+--       unsigned int cmd;
+--       unsigned int addr;
+--       unsigned int size;
+--    } s[8];
+-+   struct {
+-+      unsigned int cmd;
+-+      unsigned int handle;
+-+      unsigned int addr;
+-+      unsigned int size;
+-+   } s[8];
+- };
+- 
+--int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
+--
+-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+- 
+- #ifdef __cplusplus
+- }
+--- 
+-2.7.4
+-
+-
+-From 87a6cb3a4f7189e711c85de6d20077b6453b2ebe Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Sat, 23 May 2015 21:10:10 +0100
+-Subject: [PATCH 46/68] Fix multi mailbox extra transform call
+-
+----
+- libavcodec/hevc.c | 2 ++
+- 1 file changed, 2 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 9d12583..30f5834 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3024,7 +3024,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_INTER_QPU
+-         rpi_execute_inter_qpu(s);
+- #endif
+-+#ifndef RPI_MULTI_MAILBOX
+-         rpi_execute_transform(s);
+-+#endif
+-         rpi_execute_inter_cmds(s);
+-         vpu_wait(s->vpu_id);
+-         rpi_execute_pred_cmds(s);
+--- 
+-2.7.4
+-
+-
+-From 2a3672a1bda0296453953bebe8b17d69445260b4 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 27 May 2015 16:44:29 +0100
+-Subject: [PATCH 47/68] Added support for running luma prediction on QPUs
+-
+----
+- libavcodec/hevc.c          |  237 +++++++-
+- libavcodec/hevc.h          |   26 +-
+- libavcodec/hevc_filter.c   |   23 +-
+- libavcodec/rpi_qpu.c       |  156 ++++--
+- libavcodec/rpi_qpu.h       |    8 +-
+- libavcodec/rpi_shader.c    | 1313 ++++++++++++++++++++++----------------------
+- libavcodec/rpi_shader.h    |   21 +-
+- libavcodec/rpi_shader.qasm |  883 ++++++++++++++---------------
+- 8 files changed, 1464 insertions(+), 1203 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 30f5834..2da88ec 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -52,6 +52,11 @@
+-     // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-     #define RPI_MULTI_MAILBOX
+-   #endif
+-+
+-+  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+-+
+-+
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -74,6 +79,13 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+-+// Split image of 2048 into parts 64 wide
+-+// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+-+// Each block of 64*64
+-+// Smallest CTU size is 16x16, so smallest block is 8x8
+-+// Corresponds to a total of 83kbytes over all 12 QPUs
+-+#define RPI_LUMA_COMMAND_WORDS 9
+-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -2015,10 +2027,46 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            int reflist = 0;
+-+            const Mv *mv         = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  if (weight_flag) {
+-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-+                  } else {
+-+                      *y++ = 1; // Weight of 1 and offset of 0
+-+                  }
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+-                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2078,10 +2126,47 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            int reflist = 1;
+-+            const Mv *mv    = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-+                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  if (weight_flag) {
+-+                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-+                  } else {
+-+                      *y++ = 1; // Weight of 1 and offset of 0
+-+                  }
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+
+-+        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+-                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
+-                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2115,8 +2200,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       if (weight_flag) {
+--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
+--                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][0] & 0xffff);
+-+                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][1] & 0xffff);
+-                       } else {
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-@@ -2143,9 +2228,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+-         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+- 
+--        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi) {
+-+            const Mv *mv    = &current_mv.mv[0];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            const Mv *mv2    = &current_mv.mv[1];
+-+            int mx2          = mv2->x & 3;
+-+            int my2          = mv2->y & 3;
+-+            int my2_mx2 = (my2<<8) + mx2;
+-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int x2 = x0 + (mv2->x >> 2);
+-+            int y2 = y0 + (mv2->y >> 2);
+-+            int chan = x0>>6; // 64 wide blocks per QPU
+-+            uint32_t *y = s->y_mvs[chan % 12];
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = 1; // B frame weighted prediction not supported
+-+                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-+                }
+-+            }
+-+            s->y_mvs[chan % 12] = y;
+-+        } else
+-+#endif
+-+        {
+-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+-                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
+-                    ref1->frame, &current_mv.mv[1], &current_mv);
+-+        }
+- 
+-         if (s->ps.sps->chroma_format_idc) {
+- #ifdef RPI_INTER_QPU
+-@@ -2834,7 +2954,6 @@ static void rpi_inter_clear(HEVCContext *s)
+-         *s->u_mvs[i]++ = pic_height;
+-         *s->u_mvs[i]++ = s->frame->linesize[1];
+-         *s->u_mvs[i]++ = s->frame->linesize[2];
+--        *s->u_mvs[i]++ = i;
+-         if (weight_flag) {
+-             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-@@ -2842,7 +2961,31 @@ static void rpi_inter_clear(HEVCContext *s)
+-             *s->u_mvs[i]++ = 1 << 5;
+-             *s->u_mvs[i]++ = 6;
+-         }
+-+        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+    }
+-+
+-+#ifdef RPI_LUMA_QPU
+-+    for(i=0;i<12;i++) {
+-+        s->y_mvs[i] = s->y_mvs_base[i];
+-+        *s->y_mvs[i]++ = 0; // y_x
+-+        *s->y_mvs[i]++ = 0; // ref_y_base
+-+        *s->y_mvs[i]++ = 0; // y2_x2
+-+        *s->y_mvs[i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
+-+        if (weight_flag) {
+-+            int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
+-+            int shift = s->sh.luma_log2_weight_denom + 6;
+-+            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+        } else {
+-+            int offset = 1 << 5;
+-+            int shift = 6;
+-+            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+        }
+-+        *s->y_mvs[i]++ = 0; // Next kernel
+-     }
+-+#endif
+- }
+- 
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+-@@ -2850,6 +2993,9 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-     int k;
+-     int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+#ifdef RPI_LUMA_QPU
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
+-+#endif
+-     if (s->sh.slice_type == I_SLICE) {
+- #ifdef RPI_MULTI_MAILBOX
+-       rpi_execute_transform(s);
+-@@ -2865,8 +3011,23 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+-+#ifdef RPI_LUMA_QPU
+-+    for(k=0;k<12;k++) {
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+    }
+-+    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+#endif
+-+
+-+
+- #ifdef RPI_MULTI_MAILBOX
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
+-+#else
+-     gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+#endif
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-@@ -2876,7 +3037,27 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-                                    (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+#ifdef RPI_LUMA_QPU
+-+                                   qpu_get_fn(QPU_MC_SETUP),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
+-+#else
+-+                                   0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0
+-+#endif
+-                                  );
+-     for(i=0;i<4;i++)
+-         s->num_coeffs[i] = 0;
+-@@ -2892,6 +3073,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-       );
+- #endif
+-+
+-+
+- }
+- #endif
+- 
+-@@ -3579,8 +3762,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
+- fail:
+-     if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+- #ifdef RPI_INTER_QPU
+--        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+--        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-     }
+-@@ -3767,7 +3949,6 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- 
+- #ifdef RPI
+-     av_freep(&s->unif_mv_cmds);
+--    av_freep(&s->unif_xfm_cmds);
+-     av_freep(&s->univ_pred_cmds);
+- 
+- #ifdef RPI_INTER_QPU
+-@@ -3776,7 +3957,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-         s->unif_mvs = 0;
+-     }
+- #endif
+--    //gpu_free(&s->dummy);
+-+#ifdef RPI_LUMA_QPU
+-+    if (s->y_unif_mvs) {
+-+        gpu_free( &s->y_unif_mvs_ptr );
+-+        s->y_unif_mvs = 0;
+-+    }
+-+#endif
+- 
+- #ifdef EARLY_MALLOC
+-     printf("hevc_decode_free\n");
+-@@ -3861,9 +4047,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-     if (!s->unif_mv_cmds)
+-         goto fail;
+--    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
+--    if (!s->unif_xfm_cmds)
+--        goto fail;
+-     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-     if (!s->univ_pred_cmds)
+-         goto fail;
+-@@ -3877,7 +4060,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     {
+-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+#else
+-         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+#endif
+-         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+- 
+-         // Set up initial locations for uniform streams
+-@@ -3892,6 +4079,28 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+- 
+-     }
+- #endif
+-+#ifdef RPI_LUMA_QPU
+-+    {
+-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+#else
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+#endif
+-+        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->y_unif_mvs;
+-+        for(i = 0; i < 12; i++) {
+-+            s->y_mvs_base[i] = p;
+-+            p += y_commands_per_qpu;
+-+        }
+-+        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+-+
+-+    }
+-+#endif
+-     //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+- #ifdef EARLY_MALLOC
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 4a39e39..5df9dcd 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -44,9 +44,13 @@
+- #ifdef RPI
+- 
+-   #include "rpi_qpu.h"
+--  // Use QPU for inter prediction
+-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+-   #define RPI_INTER_QPU
+- 
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-+    #define RPI_LUMA_QPU
+-+  #endif
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -809,7 +813,6 @@ typedef struct HEVCLocalContext {
+- 
+- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+--#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
+- // Each block can have an intra prediction and a transform_add command
+- #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+- // Worst case is 16x16 CTUs
+-@@ -844,9 +847,6 @@ typedef struct HEVCMvCmd {
+-     int8_t ref_idx[2];
+- } HEVCMvCmd;
+- 
+--// Command for transform to process a block of coefficients
+--typedef struct HEVCXfmCmd {
+--} HEVCXfmCmd;
+- 
+- // Command for intra prediction and transform_add of predictions to coefficients
+- #define RPI_PRED_TRANSFORM_ADD 0
+-@@ -892,8 +892,7 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;  // TODO rename
+--    HEVCXfmCmd *unif_xfm_cmds;
+-+    HEVCMvCmd *unif_mv_cmds;
+-     HEVCPredCmd *univ_pred_cmds;
+-     int buf_width;
+-     GPU_MEM_PTR_T coeffs_buf_default;
+-@@ -920,6 +919,15 @@ typedef struct HEVCContext {
+-     uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+-+#ifdef RPI_LUMA_QPU
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr;
+-+    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[12];
+-+    uint32_t *y_mvs[12];
+-+    // Function pointers
+-+    uint32_t mc_filter;
+-+    uint32_t mc_filter_b;
+-+#endif
+- 
+- #endif
+- 
+-@@ -1166,6 +1174,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                                  int log2_trafo_size, enum ScanType scan_idx,
+-                                  int c_idx);
+- 
+-+#ifdef RPI_INTER_QPU
+-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+-+#endif
+-+
+- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+- 
+- 
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index ec84e8a..11629e4 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -883,8 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+-   return p->vc & 0x3fffffff;
+- }
+- 
+--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
+--void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+-             s->nal_unit_type == NAL_TSA_N   ||
+-@@ -911,10 +910,24 @@ void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
+-         iocache.s[1].cmd = 3; // clean+invalidate
+-         iocache.s[1].addr = p->arm + base;
+-         iocache.s[1].size  = sz;
+-+
+-+#ifdef RPI_LUMA_QPU
+-+        p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+        sz = s->frame->linesize[0] * (n-curr_y);
+-+        base = s->frame->linesize[0] * curr_y;
+-+        iocache.s[2].handle = p->vcsm_handle;
+-+        iocache.s[2].cmd = 3; // clean+invalidate
+-+        iocache.s[2].addr = p->arm + base;
+-+        iocache.s[2].size  = sz;
+-+#endif
+-         vcsm_clean_invalid( &iocache );
+- #else
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+-+#ifdef RPI_LUMA_QPU
+-+        flush_buffer(s->frame->buf[1]);
+-+#endif
+-+
+- #endif
+-         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-@@ -938,7 +951,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x, y - ctb_size);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s,&s->ref->tf, y);
+-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y, 0);
+-             }
+-@@ -947,7 +960,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-             sao_filter_CTB(s, x , y);
+-             if (s->threads_type & FF_THREAD_FRAME ) {
+- #ifdef RPI_INTER_QPU
+--                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
+-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+- #endif
+-                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+-             }
+-@@ -957,7 +970,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+--        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index aa65a77..e12304b 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -1,9 +1,11 @@
+- #ifdef RPI
+- // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+- // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+--#define RPI_TIME_TOTAL_QPU
+-+//#define RPI_TIME_TOTAL_QPU
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+-+//#define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- 
+-@@ -94,7 +96,8 @@ struct GPU
+-   int open_count; // Number of allocated video buffers
+-   int      mb; // Mailbox handle
+-   int      vc; // Address in GPU memory
+--  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
+-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+- };
+- 
+- // Stop more than one thread trying to allocate memory or use the processing resources at once
+-@@ -102,7 +105,7 @@ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+- static volatile struct GPU* gpu = NULL;
+- static GPU_MEM_PTR_T gpu_mem_ptr;
+- 
+--#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+- static unsigned int Microseconds(void) {
+-     struct timespec ts;
+-     unsigned int x;
+-@@ -123,7 +126,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
+- static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
+- static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
+- 
+--static int vpu_cmds[MAXCMDS][16];
+-+static int vpu_cmds[MAXCMDS][32];
+- static volatile int vpu_async_tail=0; // Contains the number of posted jobs
+- static volatile int vpu_async_head=0;
+- #endif
+-@@ -247,7 +250,6 @@ int gpu_get_mailbox(void)
+- // Call this to clean and invalidate a region of memory
+- void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- {
+--#define RPI_FAST_CACHEFLUSH
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     iocache.s[0].handle = p->vcsm_handle;
+-@@ -261,6 +263,34 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
+- #endif
+- }
+- 
+-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p0->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p0->arm;
+-+    iocache.s[0].size  = p0->numbytes;
+-+    iocache.s[1].handle = p1->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int) p1->arm;
+-+    iocache.s[1].size  = p1->numbytes;
+-+    iocache.s[2].handle = p2->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int) p2->arm;
+-+    iocache.s[2].size  = p2->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp;
+-+    tmp = vcsm_lock(p0->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p1->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p2->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+-+}
+-+
+- static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-   p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-@@ -357,9 +387,19 @@ unsigned int vpu_get_constants(void) {
+- #ifdef RPI_ASYNC
+- 
+- static void *vpu_start(void *arg) {
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+  int last_time=0;
+-+  long long on_time=0;
+-+  long long off_time=0;
+-+  int start_time;
+-+  int end_time;
+-+  int count=0;
+-+#endif
+-   while(1) {
+-+    int i;
+-     int *p;
+-     int qpu_code;
+-+    int qpu_codeb;
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -373,24 +413,49 @@ static void *vpu_start(void *arg) {
+-       break; // Last job
+-     }
+-     qpu_code = p[7];
+-+    qpu_codeb = p[16];
+-     //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-     //}
+-+
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+    start_time = Microseconds();
+-+    if (last_time==0)
+-+      last_time = start_time;
+-+    off_time += start_time-last_time;
+-+#endif
+-+
+-     if (!qpu_code) {
+-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-     } else {
+--      int i;
+-       for(i=0;i<8;i++) {
+-         gpu->mail[i*2] = p[8+i];
+-         gpu->mail[i*2 + 1] = qpu_code;
+-       }
+--
+--      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+--                              0, 0, 0, 0,
+-+      for(i=0;i<12;i++) {
+-+        gpu->mail2[i*2] = p[17+i];
+-+        gpu->mail2[i*2 + 1] = qpu_codeb;
+-+      }
+-+#if (0)
+-+      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-+      execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
+-+#else
+-+      execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+#endif
+-     }
+-+#ifdef RPI_TIME_TOTAL_POSTED
+-+    end_time = Microseconds();
+-+    last_time = end_time;
+-+    on_time += end_time - start_time;
+-+    count++;
+-+    if ((count&0x7f)==0)
+-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+-@@ -436,7 +501,9 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
+- }
+- 
+- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
+-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
+-+                      )
+- {
+- 
+-   pthread_mutex_lock(&post_mutex);
+-@@ -464,6 +531,21 @@ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2,
+-     p[13] = unifs6;
+-     p[14] = unifs7;
+-     p[15] = unifs8;
+-+
+-+    p[16] = qpu_codeb;
+-+    p[17] = unifs1b;
+-+    p[18] = unifs2b;
+-+    p[19] = unifs3b;
+-+    p[20] = unifs4b;
+-+    p[21] = unifs5b;
+-+    p[22] = unifs6b;
+-+    p[23] = unifs7b;
+-+    p[24] = unifs8b;
+-+    p[25] = unifs9b;
+-+    p[26] = unifs10b;
+-+    p[27] = unifs11b;
+-+    p[28] = unifs12b;
+-+
+-     if (num<=1)
+-       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
+-     pthread_mutex_unlock(&post_mutex);
+-@@ -544,27 +626,27 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
+-   off_time += start_time-last_time;
+- #endif
+-   for(i=0;i<num;i++) {
+--    gpu->mail[i*2 + 1] = code;
+-+    gpu->mail2[i*2 + 1] = code;
+-   }
+-   for(;i<num+num2;i++) {
+--    gpu->mail[i*2 + 1] = code2;
+-+    gpu->mail2[i*2 + 1] = code2;
+-   }
+--  gpu->mail[0 ] = unifs1;
+--  gpu->mail[2 ] = unifs2;
+--  gpu->mail[4 ] = unifs3;
+--  gpu->mail[6 ] = unifs4;
+--  gpu->mail[8 ] = unifs5;
+--  gpu->mail[10] = unifs6;
+--	gpu->mail[12] = unifs7;
+--	gpu->mail[14] = unifs8;
+--	gpu->mail[16] = unifs9;
+--	gpu->mail[18] = unifs10;
+--	gpu->mail[20] = unifs11;
+--	gpu->mail[22] = unifs12;
+-+  gpu->mail2[0 ] = unifs1;
+-+  gpu->mail2[2 ] = unifs2;
+-+  gpu->mail2[4 ] = unifs3;
+-+  gpu->mail2[6 ] = unifs4;
+-+  gpu->mail2[8 ] = unifs5;
+-+  gpu->mail2[10] = unifs6;
+-+	gpu->mail2[12] = unifs7;
+-+	gpu->mail2[14] = unifs8;
+-+	gpu->mail2[16] = unifs9;
+-+	gpu->mail2[18] = unifs10;
+-+	gpu->mail2[20] = unifs11;
+-+	gpu->mail2[22] = unifs12;
+- 	execute_qpu(
+- 		gpu->mb,
+- 		12 /* Number of QPUs */,
+--		gpu->vc + offsetof(struct GPU, mail),
+-+		gpu->vc + offsetof(struct GPU, mail2),
+- 		1 /* no flush */,  // Don't flush VPU L1 cache
+- 		5000 /* timeout ms */);
+- #ifdef RPI_TIME_TOTAL_QPU
+-@@ -635,21 +717,21 @@ unsigned int qpu_get_fn(int num) {
+-       gpu_unlock();
+-     }
+-     switch(num) {
+--    //case QPU_MC_SETUP:
+--    //  fn = mc_setup;
+--    //  break;
+--    //case QPU_MC_FILTER:
+--    //  fn = mc_filter;
+--    //  break;
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-     case QPU_MC_EXIT:
+-       fn = mc_exit;
+-       break;
+--    //case QPU_MC_INTERRUPT_EXIT:
+--    //  fn = mc_interrupt_exit;
+--    //  break;
+--    //case QPU_MC_FILTER_B:
+--    //  fn = mc_filter_b;
+--    //  break;
+-+    case QPU_MC_INTERRUPT_EXIT12:
+-+      fn = mc_interrupt_exit12;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-     //case QPU_MC_FILTER_HONLY:
+-     //  fn = mc_filter_honly;
+-     //  break;
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 0565a60..81c2bb1 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -1,6 +1,7 @@
+- #ifndef RPI_QPU_H
+- #define RPI_QPU_H
+- 
+-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+- #define RPI_FAST_CACHEFLUSH
+- 
+- typedef struct gpu_mem_ptr_s {
+-@@ -16,6 +17,7 @@ extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+- extern void gpu_free(GPU_MEM_PTR_T *p);
+- extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+-@@ -26,7 +28,7 @@ enum {
+-   QPU_MC_SETUP,
+-   QPU_MC_FILTER,
+-   QPU_MC_EXIT,
+--  QPU_MC_INTERRUPT_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT12,
+-   QPU_MC_FILTER_B,
+-   QPU_MC_FILTER_HONLY,
+-   QPU_MC_SETUP_UV,
+-@@ -44,7 +46,9 @@ extern unsigned int vpu_get_constants(void);
+- extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+- extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+- int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+--                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+-+                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
+-+                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
+-+                      );
+- extern void vpu_wait( int id);
+- 
+- // Simple test of shader code
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index a0f0282..e86eb30 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -48,693 +48,674 @@ unsigned int rpi_shader[] = {
+- /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
+- /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+- /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
+--/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+--/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
+--/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+--/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
+--/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+--/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
+--/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+--/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+--/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+--/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+--/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x000000d0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000000d8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000000e0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000000e8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000000f0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000000f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000100] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000108] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x00000110] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000118] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+-+/* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
+-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
+-+/* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
+-+/* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
+-+/* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000158] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000160] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000168] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000170] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000178] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000180] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000188] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000190] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000198] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000001a0] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+-+/* [0x000001a8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001b0] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+-+/* [0x000001b8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x000001c0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x000001c8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x000001d0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+- /* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+--/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+--/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
+--/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+--/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x000001e0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x000001e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000001f0] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x000001f8] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000200] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+- // ::mc_filter_uv
+--/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+--/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
+--/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000208] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000210] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000230] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000238] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000240] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000248] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000250] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000260] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000268] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000270] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000278] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000280] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000288] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000290] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000298] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000002a0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000002a8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x000002b0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x000002b8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000002c8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002d0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002d8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000002e0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000002e8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000002f0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000002f8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000300] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000308] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000310] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000318] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+-+/* [0x00000320] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000328] */ 0x0f9e7080, 0x100208e7, // asr r3, r0, r2
+-+/* [0x00000330] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000338] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
+-+/* [0x00000340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000348] */ 0x0f9e7080, 0x100608e7, // asr.ifnz r3, r0, r2
+-+/* [0x00000350] */ 0x119c87c0, 0xd00213a7, // shl rb14,r3,8
+-+/* [0x00000358] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop
+--/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
+--/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
+--/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
+--/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
+--/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
+--/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000360] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000368] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000370] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000378] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000380] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000003a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000003a8] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000003b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000003c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000003c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000003d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000003d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000003e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x000003e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x000003f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000003f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000400] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000408] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000410] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000418] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000420] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000428] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000430] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000438] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000440] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000448] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000450] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000458] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000460] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00000468] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000470] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000478] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000480] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00000488] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00000490] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000498] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000004a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000004b0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000004b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000004c0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000004c8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_uv_b0
+--/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+--/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x000004e0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x000004e8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x000004f0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x000004f8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000500] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000508] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000510] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000518] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000520] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000528] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000530] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000538] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
+-+/* [0x00000540] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x00000548] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000550] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x00000558] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000560] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000568] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x00000570] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x00000578] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000580] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000588] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000590] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000598] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x000005c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x000005d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x000005d8] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x000005e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005e8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000005f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x000005f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000600] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b0
+--/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+--/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+--/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+--/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
+--/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000610] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000650] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000660] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x00000668] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x00000670] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x00000678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000680] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000690] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x000006a0] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x000006a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000006b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000006b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x000006c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000006c8] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000006d0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000006d8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000006e0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000006e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000006f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000006f8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x00000700] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x00000708] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000710] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000720] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000728] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
+- // ::mc_filter_uv_b
+--/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+--/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+--/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+--/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+--/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+--/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+--/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+--/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+--/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+--/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+--/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+--/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+--/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+--/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000740] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000748] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000750] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000758] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+-+/* [0x00000760] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000768] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000770] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
+-+/* [0x00000778] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000780] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
+-+/* [0x00000788] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000790] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000798] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x000007a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
+-+/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x000007b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+-+/* [0x000007b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x000007c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x000007c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
+-+/* [0x000007d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
+-+/* [0x000007d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x000007e0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
+-+/* [0x000007e8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
+-+/* [0x000007f0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x000007f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000800] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+-+/* [0x00000808] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000810] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x00000818] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000820] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000828] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000830] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000838] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+-+/* [0x00000840] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000848] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+-+/* [0x00000850] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000858] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000860] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
+-+/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :uvloop_b
+--/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+--/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+--/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+--/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+--/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
+--/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+--/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+--/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+--/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+--/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+--/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
+--/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+--/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+--/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
+-+/* [0x00000890] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x000008d0] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
+-+/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
+-+/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+-+/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_exit
+--/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+--/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
+--/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a18] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a28] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_interrupt_exit8
+--/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a58] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000a68] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_setup
+--/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+--/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+--/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+--/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+--/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+--/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+--/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
+--/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+--/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+--/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
+--/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+--/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+--/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
+--/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+--/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+--/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+--/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+--/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+--/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
+--/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
+--/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+--/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+--/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
+--/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
+--/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+--/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+--/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+--/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+--/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+--/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+--/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
+--/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+--/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+--/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
+--/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+--/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+--/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+--/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
+--/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
+--/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+--/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+--/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+--/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+--/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+--/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000ac0] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000ac8] */ 0x15827d80, 0x10020227, // mov ra8, unif
+-+/* [0x00000ad0] */ 0x15827d80, 0x10020267, // mov ra9, unif
+-+/* [0x00000ad8] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+-+/* [0x00000ae0] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+-+/* [0x00000ae8] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000af0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000af8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000b00] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000b08] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+-+/* [0x00000b10] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+-+/* [0x00000b18] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+-+/* [0x00000b20] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000b28] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000b30] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000b38] */ 0x15227d80, 0x10020867, // mov r1, ra8
+-+/* [0x00000b40] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000b48] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000b50] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000b58] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000b60] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000b68] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+-+/* [0x00000b70] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000b78] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000b80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000b88] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000b90] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000b98] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000ba0] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000ba8] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000bb0] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+-+/* [0x00000bb8] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000bc0] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000bc8] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000bd0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000bd8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000be0] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+-+/* [0x00000be8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000bf0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
+-+/* [0x00000bf8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000c00] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000c08] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000c10] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000c18] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000c20] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000c28] */ 0x00000001, 0xe0020527, // mov ra20, 1
+-+/* [0x00000c30] */ 0x00000100, 0xe00205a7, // mov ra22, 256
+-+/* [0x00000c38] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000c40] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000c48] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
+-+/* [0x00000c50] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000c58] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000c60] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000c68] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000c70] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000c78] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000c80] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000c88] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000c90] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000c98] */ 0x00004000, 0xe00204a7, // mov ra18, 0x4000
+-+/* [0x00000ca0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000ca8] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000cb0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000cb8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000cc0] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000cc8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000cd8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000ce0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000ce8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000cf0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000cf8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000d00] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000d08] */ 0x919c82ff, 0xd0024822, // shl r0,r1,r3 ; mov r2,8
+-+/* [0x00000d10] */ 0x0f9e70c0, 0x10021367, // asr rb13,r0,r3
+-+/* [0x00000d18] */ 0x0f9e72c0, 0x10021327, // asr rb12,r1,r3
+-+/* [0x00000d20] */ 0x0c9cde80, 0x10021367, // add rb13,rb13,r2
+-+/* [0x00000d28] */ 0x119cce80, 0x10021327, // shl rb12, rb12, r2
+-+/* [0x00000d30] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000d38] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000d40] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000d48] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000d50] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000d58] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
+-+/* [0x00000d60] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+- /* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
+- /* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+--/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
+--// ::mc_filter
+-+/* [0x00000d78] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+-+// :per_block_setup
+- /* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- /* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+- /* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+- /* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000da0] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000da8] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000db0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000db8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000dc0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000dc8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000dd0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000dd8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000de0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000de8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+-+/* [0x00000df0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000df8] */ 0x8c827436, 0x100246a1, // add ra_frame_base_next, r2, r0 ; mov r1, unif
+-+/* [0x00000e00] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000e08] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000e10] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000e18] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000e20] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000e28] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000e30] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000e38] */ 0x159e7240, 0x10021067, // mov ra_y2_next, r1
+-+/* [0x00000e40] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000e48] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000e50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000e58] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000e60] */ 0x0e9e70c0, 0x10020867, // shr r1, r0, r3
+-+/* [0x00000e68] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+-+/* [0x00000e70] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+-+/* [0x00000e78] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+-+/* [0x00000e80] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+-+/* [0x00000e88] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+-+/* [0x00000e90] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000e98] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
+-+/* [0x00000ea0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x00000ea8] */ 0x95801dbf, 0xd0024821, // mov r0, unif ; mov r1,1
+-+/* [0x00000eb0] */ 0x4f5971c6, 0x10024260, // asr ra9, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000eb8] */ 0x4f5971c6, 0x10024220, // asr ra8, r0, rb23;      mul24 r0, r0, ra22
+-+/* [0x00000ec0] */ 0x4f5971c6, 0x10044260, // asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22
+-+/* [0x00000ec8] */ 0x0f9d71c0, 0x10040227, // asr.ifz ra8, r0, rb23
+-+/* [0x00000ed0] */ 0x0d243f80, 0xd0020267, // sub ra9,3,ra9
+-+/* [0x00000ed8] */ 0x0d203f80, 0xd0020227, // sub ra8,3,ra8
+-+/* [0x00000ee0] */ 0x11243dc0, 0xd0020267, // shl ra9,ra9,3
+-+/* [0x00000ee8] */ 0x11203dc0, 0xd0020227, // shl ra8,ra8,3
+-+/* [0x00000ef0] */ 0x00ffff00, 0xe0020867, // mov r1,0xffff00
+-+/* [0x00000ef8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f00] */ 0x0f9d71c0, 0x10020027, // asr ra0, r0, rb23
+-+/* [0x00000f08] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+- /* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
+--/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
+-+/* [0x00000f18] */ 0x01040400, 0xe0020867, // mov r1,0x1040400
+-+/* [0x00000f20] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f28] */ 0x0f9d71c0, 0x10020067, // asr ra1, r0, rb23
+-+/* [0x00000f30] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f38] */ 0x0f9d71c0, 0x10021167, // asr rb5, r0, rb23
+-+/* [0x00000f40] */ 0xfbf5f600, 0xe0020867, // mov r1,0xfbf5f600
+-+/* [0x00000f48] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f50] */ 0x0f9d71c0, 0x100200a7, // asr ra2, r0, rb23
+-+/* [0x00000f58] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f60] */ 0x0f9d71c0, 0x100211a7, // asr rb6, r0, rb23
+-+/* [0x00000f68] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+-+/* [0x00000f70] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000f78] */ 0x0f9d71c0, 0x100200e7, // asr ra3, r0, rb23
+-+/* [0x00000f80] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000f88] */ 0x0f9d71c0, 0x100211e7, // asr rb7, r0, rb23
+-+/* [0x00000f90] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+-+/* [0x00000f98] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000fa0] */ 0x0f9d71c0, 0x10020127, // asr ra4, r0, rb23
+-+/* [0x00000fa8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000fb0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
+-+/* [0x00000fb8] */ 0xf6f5fb00, 0xe0020867, // mov r1,0xf6f5fb00
+-+/* [0x00000fc0] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000fc8] */ 0x0f9d71c0, 0x10020167, // asr ra5, r0, rb23
+-+/* [0x00000fd0] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00000fd8] */ 0x0f9d71c0, 0x10021267, // asr rb9, r0, rb23
+-+/* [0x00000fe0] */ 0x04040100, 0xe0020867, // mov r1,0x4040100
+-+/* [0x00000fe8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00000ff0] */ 0x0f9d71c0, 0x100201a7, // asr ra6, r0, rb23
+-+/* [0x00000ff8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00001000] */ 0x0f9d71c0, 0x100212a7, // asr rb10, r0, rb23
+-+/* [0x00001008] */ 0xffff0000, 0xe0020867, // mov r1,0xffff0000
+-+/* [0x00001010] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
+-+/* [0x00001018] */ 0x0f9d71c0, 0x100201e7, // asr ra7, r0, rb23
+-+/* [0x00001020] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
+-+/* [0x00001028] */ 0x0f9d71c0, 0x100212e7, // asr rb11, r0, rb23
+-+/* [0x00001030] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00001038] */ 0x0f9e70c0, 0x100213e7, // asr rb15, r0, r3
+-+/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00001048] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
+-+/* [0x00001050] */ 0x8f9c00ff, 0xd0024823, // asr r0, r0, r3 ; mov r3, 0
+-+/* [0x00001058] */ 0x119c81c0, 0xd00213a7, // shl rb14, r0, 8
+-+// ::mc_filter
+- // :yloop
+--/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+--/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+--/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+--/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+--/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+--/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001060] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001068] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00001070] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001078] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001080] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001088] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001090] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001098] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000010a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000010a8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000010b0] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000010b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000010c0] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000010c8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+-+/* [0x000010d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000010d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000010e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000010e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000010f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000010f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x00001100] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001108] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001110] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001118] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001120] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001128] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001130] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001138] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001140] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001148] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001150] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001158] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001160] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
+-+/* [0x00001168] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001170] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001178] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001180] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001188] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001190] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001198] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000011a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000011a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000011b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000011b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000011c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000011c8] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000011d0] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000011d8] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000011e0] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000011e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000011f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000011f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00001200] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00001208] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00001210] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001218] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00001220] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
+-+/* [0x00001228] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001230] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001238] */ 0xfffffb28, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001240] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001248] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001250] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_filter_b
+--/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+--/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+--/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+--/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
+--/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+--/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
+--/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+--/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+--/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
+--/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+--/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
+--/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
+--/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+--/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+--/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+--/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
+--/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
+--/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
+--/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
+--/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
+--/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+--/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
+--/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+--/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+--/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
+--/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
+--/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
+--/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
+--/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
+--/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
+--/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
+--/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
+--/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
+--/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
+--/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
+--/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
+--/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
+--/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
+--/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
+--/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
+--/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
+--/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
+- // :yloopb
+--/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+--/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+--/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+--/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+--/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+--/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+--/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+--/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+--/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+--/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
+--/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+--/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+--/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+--/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+--/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+--/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+--/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+--/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+--/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+--/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+--/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+--/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+--/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+--/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+--/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+--/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+--/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+--/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
+--/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
+--/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+--/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
+--/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
+--/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+--/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+--/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+--/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+--/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+--/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00001258] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+/* [0x00001260] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00001268] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00001270] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00001278] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
+-+/* [0x00001280] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
+-+/* [0x00001288] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00001290] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00001298] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x000012a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x000012a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
+-+/* [0x000012b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x000012b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+/* [0x000012c0] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+-+/* [0x000012c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000012d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
+-+/* [0x000012d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+/* [0x000012e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+/* [0x000012e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+/* [0x000012f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+/* [0x000012f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+/* [0x00001300] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+/* [0x00001308] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+/* [0x00001310] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+/* [0x00001318] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+/* [0x00001320] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+/* [0x00001328] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+/* [0x00001330] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+/* [0x00001338] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+/* [0x00001340] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+/* [0x00001348] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+/* [0x00001350] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001358] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
+-+/* [0x00001360] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
+-+/* [0x00001368] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
+-+/* [0x00001370] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
+-+/* [0x00001378] */ 0x15367d80, 0x10020327, // mov ra12, ra13
+-+/* [0x00001380] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001388] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
+-+/* [0x00001390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00001398] */ 0x159e7000, 0x100203e7, // mov ra15, r0
+-+/* [0x000013a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
+-+/* [0x000013a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
+-+/* [0x000013b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+/* [0x000013b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000013c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+/* [0x000013c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+/* [0x000013d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000013d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000013e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000013e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+/* [0x000013f0] */ 0x0f9ce3c0, 0xd0020827, // asr r0, r1, 14
+-+/* [0x000013f8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
+-+/* [0x00001400] */ 0x405b8006, 0xd00049e0, // nop                     ; mul24 r0, r0 << 8, ra22 << 8
+-+/* [0x00001408] */ 0x0c4a7380, 0x10020867, // add r1, r1, ra18
+-+/* [0x00001410] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
+-+/* [0x00001418] */ 0xfffffe20, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001420] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
+-+/* [0x00001428] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
+-+/* [0x00001430] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001438] */ 0xfffff928, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001440] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001448] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001450] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+- // ::mc_interrupt_exit12
+--/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+--/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
+--/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+--/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+--/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+--/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x00001458] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001460] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001468] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001470] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001478] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001480] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001488] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001490] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001498] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000014d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000014e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000014e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x000014f0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000014f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001500] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001508] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001510] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001518] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001520] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001528] */ 0x009e7000, 0x100009e7, // nop        ; nop
+- // ::mc_end
+- };
+- #ifdef __HIGHC__
+-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+-index 6e552d9..760bd17 100644
+---- a/libavcodec/rpi_shader.h
+-+++ b/libavcodec/rpi_shader.h
+-@@ -4,15 +4,16 @@
+- extern unsigned int rpi_shader[];
+- 
+- #define mc_setup_uv (rpi_shader + 0)
+--#define mc_filter_uv (rpi_shader + 144)
+--#define mc_filter_uv_b0 (rpi_shader + 334)
+--#define mc_filter_uv_b (rpi_shader + 486)
+--#define mc_exit (rpi_shader + 662)
+--#define mc_interrupt_exit8 (rpi_shader + 680)
+--#define mc_setup (rpi_shader + 710)
+--#define mc_filter (rpi_shader + 864)
+--#define mc_filter_b (rpi_shader + 1104)
+--#define mc_interrupt_exit12 (rpi_shader + 1360)
+--#define mc_end (rpi_shader + 1398)
+-+#define mc_filter_uv (rpi_shader + 130)
+-+#define mc_filter_uv_b0 (rpi_shader + 312)
+-+#define mc_filter_uv_b (rpi_shader + 464)
+-+#define mc_exit (rpi_shader + 640)
+-+#define mc_interrupt_exit8 (rpi_shader + 658)
+-+#define mc_setup (rpi_shader + 688)
+-+#define mc_filter (rpi_shader + 1048)
+-+#define mc_filter_b (rpi_shader + 1174)
+-+#define mc_interrupt_exit12 (rpi_shader + 1302)
+-+#define mc_exit1 (rpi_shader + 1340)
+-+#define mc_end (rpi_shader + 1356)
+- 
+- #endif
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index a0b8e5a..60d1ec2 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -21,6 +21,7 @@
+- #
+- # ra16                                          clipped(row start address+elem_num)&~3
+- # ra17                                          per-channel shifts
+-+# ra18                                          0x4000
+- # ra19                                          next ra17
+- #
+- # rb16                                          pitch
+-@@ -86,7 +87,7 @@
+- 
+- 
+- ################################################################################
+--# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+- ::mc_setup_uv
+- 
+- # Read starting kernel
+-@@ -132,36 +133,6 @@ mov ra13, 0
+- mov ra14, 0
+- mov ra15, 0
+- 
+--# Compute part of VPM to use for DMA output
+--mov r3, unif
+--shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+--and r2, r2, 15
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+--
+--# Compute part of VPM to save data into
+--shl r2, r3, 1
+--and r2, r2, 15    # r2 = bcd0
+--mov r1, r2        # r1 = bcd0
+--asr r1, r1, 2     # r1 = bc
+--shl r1, r1, 6     # r1 = bc000000
+--mov r0, r2        # r0 = bcd0
+--and r0, r0, 3     # r0 = d0
+--add r0, r0, r1    # r0 = bc0000d0
+--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+--add rb28, r0, r1
+--asr r0, r0, 1     # r0 = bc0000d
+--# Prepare VPM command for 16bit intermediates
+--mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+--add rb21, r0, r1
+--
+- # Compute base address for first and second access
+- mov r0, ra_x           # Load x
+- max r0, r0, 0; mov r1, ra_y # Load y
+-@@ -175,10 +146,31 @@ min r1, r1, rb_frame_height_minus_1
+- # submit texture requests for first line
+- add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+- add t0s, r0, r1 ; mov ra_frame_base, r2
+--add t0s, r2, r1
+-+add t1s, r2, r1
+-+
+-+mov r2,8
+-+shl rb12,unif, r2 # offset before shift
+-+add rb13,unif,r2  # offset after shift
+-+
+-+# Compute part of VPM to use for DMA output
+-+mov r2, unif
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+- 
+--mov rb12,unif # offset before shift
+--mov rb13,unif # offset after shift
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1  # VPM 8bit storage
+-+asr r2, r0, 1     # r0 = bc0000d
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r2, r1  # VPM for 16bit intermediates
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1  # DMA out
+- 
+- # submit texture requests for second line
+- max r1, ra_y, 0
+-@@ -187,7 +179,7 @@ add ra_y, ra_y, 1
+- bra -, ra31
+- nop ; mul24 r1, r1, rb_pitch
+- add t0s, r1, ra_x
+--add t0s, r1, ra_frame_base
+-+add t1s, r1, ra_frame_base
+- 
+- 
+- 
+-@@ -248,17 +240,15 @@ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- mov r0, unif # U offset/weight
+- asr rb15, r0, r2  # Compute offset from MSBs
+- shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+-+asr r3, r0, r2  # Compute weight from LSBs
+- mov r0, unif # V offset/weight
+- asr.ifnz rb15, r0, r2
+- shl r0, r0, r2
+--asr.ifnz rb14, r0, r2
+-+asr.ifnz r3, r0, r2
+-+shl rb14,r3,8 # Scale up weights so we can use mul24 in signed fashion
+- 
+- # r2 is elem_num
+- # r3 is loop counter
+--
+--mov r5rep, -8
+--
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+- 
+-@@ -269,7 +259,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -278,7 +268,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -301,11 +291,6 @@ mov ra13, ra14       # Delay slot 1
+- mov ra14, ra15       # Delay slot 2
+- mov ra15, r0         # Delay slot 3
+- 
+--mov rb12,32 # TODO remove these to make P weighted prediction work properly
+--mov rb13,6
+--mov rb14,1
+--mov rb15,0
+--
+- # apply vertical filter and write to VPM
+- 
+- nop                     ; mul24 r1, ra14, rb10
+-@@ -412,7 +397,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -421,7 +406,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -542,7 +527,7 @@ mov r3, 0
+- # then submit two more texture requests
+- 
+- sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
+--shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+- mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+- mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+- shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+-@@ -551,7 +536,7 @@ max r2, ra_y, 0  # y
+- min r2, r2, rb_frame_height_minus_1
+- add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+- add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+--add t0s, ra_frame_base, r2
+-+add t1s, ra_frame_base, r2
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+-@@ -617,9 +602,9 @@ mov  -, vw_wait # wait on the VDW
+- mov -,srel(0)
+- 
+- ldtmu0
+-+ldtmu1
+- ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu1
+- 
+- nop        ; nop ; thrend
+- nop        ; nop # delay slot 1
+-@@ -630,9 +615,9 @@ nop        ; nop # delay slot 2
+- mov  -, vw_wait # wait on the VDW
+- 
+- ldtmu0
+-+ldtmu1
+- ldtmu0
+--ldtmu0
+--ldtmu0
+-+ldtmu1
+- 
+- mov -,sacq(0) # 1
+- mov -,sacq(0) # 2
+-@@ -656,200 +641,249 @@ nop        ; nop # delay slot 2
+- # For P frames we make the second x,y coordinates offset by +8
+- 
+- ################################################################################
+--# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
+-+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+- ::mc_setup
+-+  mov r3, 16
+- 
+--# Read starting kernel
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+--
+--# Compute base address for first and second access
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--add ra_y, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+--max r1, r1, 0
+--min r1, r1, rb_frame_height_minus_1
+--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+--add t0s, r2, r1 ; mov ra_frame_base, r2
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+--max r1, r1, 0
+--min r1, r1, rb_frame_height_minus_1
+--nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+--add t0s, r2, r1 ; mov ra_frame_base2, r2
+--
+-+  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+-+  mov ra8, unif
+-+  mov ra9, unif
+-+  mov ra10, unif
+-+  mov ra11, unif
+- 
+- # Read image dimensions
+--sub rb25,unif,1
+--sub rb30,unif,1
+-+  mov r1, unif # width_height
+-+  shl r0,r1,r3
+-+  asr r1,r1,r3 # width
+-+  asr r0,r0,r3 # height
+-+  sub rb_frame_width_minus_1,r1,1
+-+  sub rb_frame_height_minus_1,r0,1
+- 
+- # get source pitch
+--mov rb16, unif
+-+  mov rb_pitch, unif
+- 
+- # get destination pitch
+--mov r0, unif
+--mov r1, vdw_setup_1(0)
+--add rb24, r1, r0
+-+  mov r0, unif
+-+  mov r1, vdw_setup_1(0)
+-+  add rb24, r1, r0
+- 
+--# load constants
+--
+--mov ra20, 1
+--mov ra22, 256
+--mov ra30, 64
+--
+--mov rb20, 0xffffff00
+--mov rb22, 255
+--mov rb23, 24
+-+# Compute base address for first and second access
+-+  mov r1, ra8 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  add ra_y, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+-+  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+  mov r1, ra10 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  add ra_y2, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+-+  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
+- 
+--# touch vertical context to keep simulator happy
+- 
+--mov ra8, 0
+--mov ra9, 0
+--mov ra10, 0
+--mov ra11, 0
+--mov ra12, 0
+--mov ra13, 0
+--mov ra14, 0
+--mov ra15, 0
+-+# load constants
+- 
+--# Compute part of VPM to use for DMA output
+--mov r2, qpu_num
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+--shl r0, r0, 5
+--add rb27, r0, r1
+-+  mov ra20, 1
+-+  mov ra22, 256
+-+  mov ra30, 64
+- 
+--# Compute part of VPM to save data into
+--mov r2, qpu_num   # qpu_num = abcd
+--mov r1, r2
+--asr r1, r1, 2
+--shl r1, r1, 6
+--mov r0, r2
+--and r0, r0, 3
+--add r0, r0, r1
+--mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+--add rb28, r0, r1
+-+  mov rb20, 0xffffff00
+-+  mov rb22, 255
+-+  mov rb23, 24
+- 
+--mov rb12,unif # offset before shift
+--mov rb13,unif # shift
+-+# touch vertical context to keep simulator happy
+- 
+--# Dump padding words
+--mov r0, unif
+-+  mov ra8, 0
+-+  mov ra9, 0
+-+  mov ra10, 0
+-+  mov ra11, 0
+-+  mov ra12, 0
+-+  mov ra13, 0
+-+  mov ra14, 0
+-+  mov ra15, 0
+-+  mov ra18, 0x4000
+-+
+-+# Compute part of VPM to use
+-+  mov r2, qpu_num
+-+  mov r1, r2
+-+  asr r1, r1, 2
+-+  shl r1, r1, 6
+-+  mov r0, r2
+-+  and r0, r0, 3
+-+  add r0, r0, r1
+-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+  add rb28, r0, r1  # VPM for saving data
+-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+  shl r0, r0, 5
+-+  add rb27, r0, r1  # Command for dma output
+-+
+-+# Weighted prediction denom
+-+
+-+  mov r1, unif # offset_shift
+-+  shl r0,r1,r3 ; mov r2,8
+-+  asr rb13,r0,r3 # shift
+-+  asr rb12,r1,r3 # offset
+-+  add rb13,rb13,r2    # mul24 is unsigned so scale up into high bits
+-+  shl rb12, rb12, r2 # Account for larger shift
+- 
+- # submit texture requests for second line
+--max r1, ra_y, 0
+--min r1, r1, rb_frame_height_minus_1
+--add ra_y, ra_y, 1
+--nop ; mul24 r1, r1, rb_pitch
+--add t0s, r1, ra_frame_base
+--
+--max r1, ra_y2, 0
+--min r1, r1, rb_frame_height_minus_1
+--bra -, ra31
+--add ra_y2, ra_y2, 1           # Delay 1
+--nop ; mul24 r1, r1, rb_pitch  # Delay 2
+--add t0s, r1, ra_frame_base2   # Delay 3
+--
+--
+--################################################################################
+--
+--# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+--# In a P block, only the first half of coefficients contain used information.
+--# At this point we have already issued two pairs of texture requests for the current block
+--# ra_x, ra_x16_base point to the current coordinates for this block
+--::mc_filter
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+-+  max r1, ra_y, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t0s, r1, ra_frame_base
+-+
+-+  max r1, ra_y2, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t1s, r1, ra_frame_base2
+-+
+-+# FALL THROUGHT TO PER-BLOCK SETUP
+-+
+-+# Start of per-block setup code
+-+# P and B blocks share the same setup code to save on Icache space
+-+:per_block_setup
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov ra31, unif
+- 
+- # per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov rx_xshift2, rx_xshift2_next
+-+  mov ra_xshift, ra_xshift_next
+-+  mov rx_xshift2, rx_xshift2_next
+- 
+- # get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--mov ra_y_next, r1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0   ; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2_next, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+--
+-+  mov r3, 16
+-+  mov r1, unif # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  mov ra_y_next, r1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add ra_frame_base_next, r2, r0 ; mov r1, unif # y2_x2
+-+
+-+  shl r0,r1,r3 # r0 is x2<<16
+-+  asr r1,r1,r3 # r1 is y2
+-+  asr r0,r0,r3 # r0 is x2
+-+  add r0, r0, elem_num # Load x
+-+  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  mov ra_y2_next, r1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+- 
+- # set up VPM write
+--mov vw_setup, rb28
+-+  mov vw_setup, rb28
+- 
+- # get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+-+  mov r0, unif
+-+  shr r1, r0, r3 # Extract width
+-+  sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+-+  and r0, r0, rb22 # Extract height
+-+  add rb17, r0, 5
+-+  add rb18, r0, 7
+-+  shl r0, r0, 7
+-+  add r0, r0, r1 # Combine width and height of destination area
+-+  shl r0, r0, r3 # Shift into bits 16 upwards of the vdw_setup0 register
+-+  add rb26, r0, rb27
+- 
+- # get filter coefficients and discard unused B frame values
+--mov r0, unif
+--mov.ifnz -, unif # Alternate coefficients are unused for P frames
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--asr ra4, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--mov.ifnz -, unif
+--asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--asr rb4, r0, rb23
+--
+--mov r0, unif # Frame0 offset/weight
+--mov.ifnz -, unif # Frame1 offset/weight unused
+--asr rb15, r0, r2  # Compute offset from MSBs
+--shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+--
+--# r3 is loop counter
+-+  mov r0, unif ; mov r1,1  # Packed filter offsets, unpack into ra8... (to be used for vertical context later)
+-+  asr ra9, r0, rb23;      mul24 r0, r0, ra22 # my2
+-+  asr ra8, r0, rb23;      mul24 r0, r0, ra22 # mx2
+-+  asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22 # my:my2
+-+  asr.ifz ra8, r0, rb23                      # mx:mx2
+-+  sub ra9,3,ra9
+-+  sub ra8,3,ra8
+-+  shl ra9,ra9,3   # Scale up by 8
+-+  shl ra8,ra8,3   # Scale up by 8
+-+# Now if we want aligned we have a mul of 1, so put 0 coefficients at the top
+-+  mov r1,0xffff00
+-+  shl r0, r1, ra8
+-+  asr ra0, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb4, r0, rb23
+-+
+-+  mov r1,0x1040400
+-+  shl r0, r1, ra8
+-+  asr ra1, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb5, r0, rb23
+-+
+-+  mov r1,0xfbf5f600
+-+  shl r0, r1, ra8
+-+  asr ra2, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb6, r0, rb23
+-+
+-+  mov r1,0x11283a40
+-+  shl r0, r1, ra8
+-+  asr ra3, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb7, r0, rb23
+-+
+-+  mov r1,0x3a281100
+-+  shl r0, r1, ra8
+-+  asr ra4, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb8, r0, rb23
+-+
+-+  mov r1,0xf6f5fb00
+-+  shl r0, r1, ra8
+-+  asr ra5, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb9, r0, rb23
+-+
+-+  mov r1,0x4040100
+-+  shl r0, r1, ra8
+-+  asr ra6, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb10, r0, rb23
+-+
+-+  mov r1,0xffff0000
+-+  shl r0, r1, ra8
+-+  asr ra7, r0, rb23
+-+  shl r0, r1, ra9
+-+  asr rb11, r0, rb23
+-+
+-+# Extract weighted prediction information
+-+  mov r0, unif      # offset/weight  TODO move up
+-+  asr rb15, r0, r3  # Compute offset from MSBs
+-+  bra -, ra31
+-+  shl r0, r0, r3    #                                                            Delay 1
+-+  asr r0, r0, r3 ; mov r3, 0 # Compute weight from LSBs and reset loop counter   Delay 2
+-+  shl rb14, r0, 8 # Use a larger shift to avoid unsigned multiply problem        Delay 3
+- 
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+-+################################################################################
+-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+# In a P block, y2_x2 should be y_x+8
+-+# At this point we have already issued two pairs of texture requests for the current block
+- 
+--mov r3, 0
+-+::mc_filter
+- 
+- :yloop
+- # retrieve texture results and pick out bytes
+-@@ -858,91 +892,90 @@ mov r3, 0
+- # If we knew there was no clipping then this code would get simpler.
+- # Perhaps we could add on the pitch and clip using larger values?
+- 
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, rx_xshift2
+--mov.ifz ra_y2, ra_y2_next
+-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2
+-+  mov.ifz ra_y2, ra_y2_next
+- 
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+--
+--max r2, ra_y2, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+- 
+-+  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+- 
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # apply horizontal filter
+--nop                  ; mul24 r2, r0, ra0
+--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--add r0, r2, r3       ; mov r3, rb31
+--sub.setf -, r3, 8    ; mov ra12, ra13
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--brr.anyn -, r:yloop
+--mov ra13, ra14       # Delay slot 1
+--mov ra14, ra15       # Delay slot 2
+--mov ra15, r0         # Delay slot 3
+-+  nop                  ; mul24 r2, r0, ra0
+-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+  add r0, r2, r3       ; mov r3, rb31
+-+  sub.setf -, r3, 8    ; mov ra8, ra9
+-+  mov ra9, ra10
+-+  mov ra10, ra11
+-+  mov ra11, ra12
+-+  mov ra12, ra13
+-+  brr.anyn -, r:yloop
+-+  mov ra13, ra14       # Delay slot 1
+-+  mov ra14, ra15       # Delay slot 2
+-+  mov ra15, r0         # Delay slot 3
+- 
+- # apply vertical filter and write to VPM
+- 
+--nop                     ; mul24 r1, ra14, rb10
+--nop                     ; mul24 r0, ra13, rb9
+--add r1, r1, r0          ; mul24 r0, ra12, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb11
+--add r1, r1, r0          ; mul24 r0, ra8, rb4
+--add r1, r1, r0          ; mul24 r0, ra9, rb5
+--add r1, r1, r0          ; mul24 r0, ra10, rb6
+--add r1, r1, r0          ; mul24 r0, ra11, rb7
+--
+--add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--nop                     ; mul24 r1, r1, rb14
+--add r1, r1, rb12
+--asr r1, r1, rb13
+--brr.anyn -, r:yloop
+--add r1, r1, rb15       # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+  nop                     ; mul24 r1, ra14, rb10
+-+  nop                     ; mul24 r0, ra13, rb9
+-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+  asr r1, r1, 14
+-+  nop                     ; mul24 r1, r1, rb14
+-+  add r1, r1, rb12
+-+  asr r1, r1, rb13
+-+  brr.anyn -, r:yloop
+-+  add r1, r1, rb15       # Delay 1
+-+  min r1, r1, rb22       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+- 
+- # DMA out
+- 
+--bra -, ra31
+--mov vw_setup, rb26 # VDW setup 0    Delay 1
+--mov vw_setup, rb29 # Stride         Delay 2
+--mov vw_addr, unif # start the VDW   Delay 3
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+- 
+- 
+- 
+- ################################################################################
+- 
+--# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
+-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+- # In a P block, only the first half of coefficients contain used information.
+- # At this point we have already issued two pairs of texture requests for the current block
+- # May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+-@@ -952,92 +985,6 @@ mov vw_addr, unif # start the VDW   Delay 3
+- # Or possibly by taking advantage of symmetry?
+- # From 19->7 32bits per command.
+- ::mc_filter_b
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+--mov ra31, unif
+--
+--# per-channel shifts were calculated on the *previous* invocation
+--
+--mov ra_xshift, ra_xshift_next
+--mov rx_xshift2, rx_xshift2_next
+--
+--# get base addresses and per-channel shifts for *next* invocation
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl ra_xshift_next, r0, 3 # Compute shifts
+--mov ra_y_next, r1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
+--
+--add r0, unif, elem_num # Load x
+--max r0, r0, 0   ; mov r1, unif # Load y
+--min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+--shl rx_xshift2_next, r0, 3 # Compute shifts
+--add ra_y2_next, r1, 1
+--and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+--add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
+--
+--
+--# set up VPM write
+--mov vw_setup, rb28
+--
+--# get width,height of block
+--mov r2, 16
+--mov r0, unif
+--shr r1, r0, r2 # Extract width
+--sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
+--and r0, r0, rb22 # Extract height
+--add rb17, r0, 5
+--add rb18, r0, 7
+--shl r0, r0, 7
+--add r0, r0, r1 # Combine width and height of destination area
+--shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
+--add rb26, r0, rb27
+--
+--# get filter coefficients and discard unused B frame values
+--mov r0, unif
+--mov r1, 1
+--mov.ifnz r0, unif # Alternate coefficients are unused for P frames
+--nop              ;      mul24 r0, r0 << 13, r1 << 13
+--asr ra3, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 14, r1 << 14
+--asr ra2, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
+--asr ra1, r0, rb23;      mul24 r0, r0, ra22
+--asr ra0, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--nop              ;      mul24 r0, r0 << 9, r1 << 9
+--asr ra7, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 10, r1 << 10
+--asr ra6, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 11, r1 << 11
+--asr ra5, r0, rb23;      mul24 r0, r0, ra22
+--nop              ;      mul24 r0, r0 << 12, r1 << 12
+--asr ra4, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--asr rb11, r0, rb23;     mul24 r0, r0, ra22
+--asr rb10, r0, rb23;     mul24 r0, r0, ra22
+--asr rb9, r0, rb23;      mul24 r0, r0, ra22
+--asr rb8, r0, rb23;      mov r0, unif
+--mov.ifnz r0, unif
+--asr rb7, r0, rb23;      mul24 r0, r0, ra22
+--asr rb6, r0, rb23;      mul24 r0, r0, ra22
+--asr rb5, r0, rb23;      mul24 r0, r0, ra22
+--asr rb4, r0, rb23
+--
+--mov r0, unif # Frame0 offset/weight
+--mov.ifnz r0, unif # Frame1 offset/weight unused
+--asr rb15, r0, r2  # Compute offset from MSBs
+--shl r0, r0, r2
+--asr rb14, r0, r2  # Compute weight from LSBs
+--
+--# r3 is loop counter
+--
+--# retrieve texture results and pick out bytes
+--# then submit two more texture requests
+--
+--mov r3, 0
+--
+- :yloopb
+- # retrieve texture results and pick out bytes
+- # then submit two more texture requests
+-@@ -1045,111 +992,123 @@ mov r3, 0
+- # If we knew there was no clipping then this code would get simpler.
+- # Perhaps we could add on the pitch and clip using larger values?
+- 
+--sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+--shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
+--mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+--mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+--shr r1, r4, rx_xshift2
+--mov.ifz ra_y2, ra_y2_next
+-+  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2
+-+  mov.ifz ra_y2, ra_y2_next
+- 
+--max r2, ra_y, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+--
+--max r2, ra_y2, 0  # y
+--min r2, r2, rb_frame_height_minus_1
+--add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+--add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
+-+  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+- 
+-+  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+-+  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
+- 
+- # generate seven shifted versions
+- # interleave with scroll of vertical context
+- 
+--mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+- 
+- # apply horizontal filter
+--nop                  ; mul24 r2, r0, ra0
+--nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+--nop                  ; mul24      r3, ra1 << 1, r0 << 1
+--nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+--add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+--nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+--add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+--nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+--add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+--nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+--add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+--nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+--add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+--nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+--add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+--nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+--add r0, r2, r3       ; mov r3, rb31
+--sub.setf -, r3, 8    ; mov ra12, ra13
+--mov ra9, ra10
+--mov ra10, ra11
+--mov ra11, ra12
+--mov ra12, ra13
+--brr.anyn -, r:yloopb
+--mov ra13, ra14       # Delay slot 1
+--mov ra14, ra15       # Delay slot 2
+--mov ra15, r0         # Delay slot 3
+--
+--# apply vertical filter and write to VPM
+--
+--nop                     ; mul24 r1, ra14, rb10
+--nop                     ; mul24 r0, ra13, rb9
+--add r1, r1, r0          ; mul24 r0, ra12, rb8
+--add r1, r1, r0          ; mul24 r0, ra15, rb11
+--add r1, r1, r0          ; mul24 r0, ra8, rb4
+--add r1, r1, r0          ; mul24 r0, ra9, rb5
+--add r1, r1, r0          ; mul24 r0, ra10, rb6
+--add r1, r1, r0          ; mul24 r0, ra11, rb7
+--
+--add r1, r1, r0          ; mov -, vw_wait
+--sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+--asr r1, r1, 14
+--nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
+--add r1, r1, ra30        ; mul24 r0, r1, rb14
+--add r1, r1, r0
+--brr.anyn -, r:yloopb
+--asr r1, r1, 7          # Delay 1
+--min r1, r1, rb22       # Delay 2
+--max vpm, r1, 0         # Delay 3
+-+  nop                  ; mul24 r2, r0, ra0
+-+  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
+-+  nop                  ; mul24      r3, ra1 << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
+-+  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
+-+  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
+-+  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
+-+  add r0, r2, r3       ; mov r3, rb31
+-+  sub.setf -, r3, 8    ; mov ra8, ra9
+-+  mov ra9, ra10
+-+  mov ra10, ra11
+-+  mov ra11, ra12
+-+  mov ra12, ra13
+-+  brr.anyn -, r:yloopb
+-+  mov ra13, ra14       # Delay slot 1
+-+  mov ra14, ra15       # Delay slot 2
+-+  mov ra15, r0         # Delay slot 3
+-+
+-+  # apply vertical filter and write to VPM
+-+
+-+  nop                     ; mul24 r1, ra14, rb10
+-+  nop                     ; mul24 r0, ra13, rb9
+-+  add r1, r1, r0          ; mul24 r0, ra12, rb8
+-+  add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+  add r1, r1, r0          ; mul24 r0, ra8, rb4
+-+  add r1, r1, r0          ; mul24 r0, ra9, rb5
+-+  add r1, r1, r0          ; mul24 r0, ra10, rb6
+-+  add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
+-+  asr r0, r1, 14
+-+  asr r1, r1, 6           # Wait state so we can use the rotate instruction
+-+  nop                     ; mul24 r0, r0 << 8, ra22 << 8 # Rotate to align left and right halves
+-+  add r1, r1, ra18
+-+  add r1, r1, r0
+-+  brr.anyn -, r:yloopb
+-+  asr r1, r1, 15         # Delay 1
+-+  min r1, r1, rb22       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+- 
+- # DMA out
+--bra -, ra31
+--mov vw_setup, rb26 # VDW setup 0    Delay 1
+--mov vw_setup, rb29 # Stride         Delay 2
+--mov vw_addr, unif # start the VDW   Delay 3
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+- 
+- ################################################################################
+- 
+- # mc_interrupt_exit12()
+- ::mc_interrupt_exit12
+--mov  -, vw_wait # wait on the VDW
+--
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--ldtmu0
+--
+--mov -,sacq(0) # 1
+--mov -,sacq(0) # 2
+--mov -,sacq(0) # 3
+--mov -,sacq(0) # 4
+--mov -,sacq(0) # 5
+--mov -,sacq(0) # 6
+--mov -,sacq(0) # 7
+--mov -,sacq(0) # 8
+--mov -,sacq(0) # 9
+--mov -,sacq(0) # 10
+--mov -,sacq(0) # 11
+--
+--nop        ; nop ; thrend
+--mov interrupt, 1; nop # delay slot 1
+--nop        ; nop # delay slot 2
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu1
+-+
+-+  mov -,sacq(0) # 1
+-+  mov -,sacq(0) # 2
+-+  mov -,sacq(0) # 3
+-+  mov -,sacq(0) # 4
+-+  mov -,sacq(0) # 5
+-+  mov -,sacq(0) # 6
+-+  mov -,sacq(0) # 7
+-+  mov -,sacq(0) # 8
+-+  mov -,sacq(0) # 9
+-+  mov -,sacq(0) # 10
+-+  mov -,sacq(0) # 11
+-+
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+-+
+-+
+-+::mc_exit1
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu0
+-+  ldtmu1
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+- 
+- 
+- ::mc_end
+--- 
+-2.7.4
+-
+-
+-From f02ec34c772aad3caa17432c6a4860f9ed0d5dc6 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 10:58:25 +0100
+-Subject: [PATCH 48/68] Added option to simulate QPUs
+-
+----
+- libavcodec/hevc.c          | 288 +++++++++++++++++++++++++++++++++++++++++++--
+- libavcodec/rpi_qpu.c       |  24 ++--
+- libavcodec/rpi_shader.qasm |   6 +-
+- 3 files changed, 295 insertions(+), 23 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 2da88ec..34d92e2 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -56,6 +56,8 @@
+-   // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-   // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+- 
+-+  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
+-+  //#define RPI_SIMULATE_QPUS
+- 
+- #endif
+- 
+-@@ -124,7 +126,6 @@ static void pic_arrays_free(HEVCContext *s)
+- 
+- #ifdef EARLY_MALLOC
+- #else
+--    printf("pic_arrays_free\n");
+-     if (s->coeffs_buf_arm[0]) {
+-       gpu_free(&s->coeffs_buf_default);
+-       s->coeffs_buf_arm[0] = 0;
+-@@ -174,11 +175,9 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- #ifdef RPI
+- #ifdef EARLY_MALLOC
+- #else
+--    assert(sps);
+-+    av_assert0(sps);
+-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+--    printf("pic_arrays_init\n");
+--    printf("Allocated %d\n",coefs_per_row);
+-     gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+-     s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+-     if (!s->coeffs_buf_arm[0])
+-@@ -2988,6 +2987,274 @@ static void rpi_inter_clear(HEVCContext *s)
+- #endif
+- }
+- 
+-+
+-+#ifdef RPI_SIMULATE_QPUS
+-+
+-+static int32_t clipx(int x,int FRAME_WIDTH)
+-+{
+-+	if (x<=0) return 0;
+-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+-+	return x;
+-+}
+-+
+-+static int32_t clipy(int y,int FRAME_HEIGHT)
+-+{
+-+	if (y<=0) return 0;
+-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+-+	return y;
+-+}
+-+
+-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+-+{
+-+   int32_t vsum = 0;
+-+   int x, y;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+-+
+-+      vsum += lumaFilter[my][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+round)>>denom)+offset;
+-+
+-+   return av_clip_uint8( vsum );
+-+}*/
+-+
+-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int chromaFilterH[4];
+-+  int chromaFilterV[4];
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+  for(i=0;i<4;i++) {
+-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+-+  }
+-+
+-+   for (y = 0; y < 4; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 4; x++)
+-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+-+
+-+      vsum += chromaFilterV[y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+-+
+-+   return vsum;
+-+}
+-+
+-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+-+
+-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+-+
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+-+
+-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+-+
+-+   return vsum;
+-+}
+-+
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
+-+{
+-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-+  int pitch = frame->linesize[cIdx];
+-+  uint32_t base = get_vc_address(frame->buf[cIdx]);
+-+  if (p>=base && p<base+pitch*pic_height) {
+-+    return frame->data[cIdx] + (p-base);
+-+  }
+-+  return NULL;
+-+}
+-+
+-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+-+{
+-+  SliceHeader *sh   = &s->sh;
+-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+-+  int i;
+-+  if (arm) return arm;
+-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+-+  {
+-+    for(i=0;i<sh->nb_refs[L0];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  if (sh->slice_type == B_SLICE) {
+-+    for(i=0;i<sh->nb_refs[L1];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+-+  exit(-1);
+-+  return NULL;
+-+}
+-+
+-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint8_t *ref_u_base;
+-+  uint8_t *ref_v_base;
+-+  uint32_t frame_width = p[5];
+-+  uint32_t frame_height = p[6];
+-+  uint32_t pitch = p[7];
+-+  uint32_t dst_pitch = p[8];
+-+  int32_t offset_before = p[9];
+-+  int32_t denom = p[10];
+-+  uint32_t vpm_id = p[11];
+-+  uint32_t tmp_u_dst[256];
+-+  uint32_t tmp_v_dst[256];
+-+  while(1) {
+-+    p += 12;
+-+    next_kernel = p[0-12];
+-+    x0 = p[1-12];
+-+    y0 = p[2-12];
+-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+-+      int x,y;
+-+      uint32_t width_height = p[5];
+-+      uint32_t hcoeffs = p[6];
+-+      uint32_t vcoeffs = p[7];
+-+      uint32_t offset_weight_u = p[8];
+-+      uint32_t offset_weight_v = p[9];
+-+      uint8_t *this_u_dst;
+-+      uint8_t *this_v_dst;
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
+-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
+-+      if (next_kernel!=s->mc_filter_uv_b0)
+-+      {
+-+        this_u_dst = compute_arm_addr(s,p[10],1);
+-+        this_v_dst = compute_arm_addr(s,p[11],2);
+-+      }
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter_uv) {
+-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          } else if (next_kernel==s->mc_filter_uv_b0) {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            tmp_u_dst[x+y*16] = refa;
+-+            tmp_v_dst[x+y*16] = refb;
+-+          } else {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  int y_x,y2_x2;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint32_t x2;
+-+  uint32_t y2;
+-+  uint8_t *ref_y_base;
+-+  uint8_t *ref_y2_base;
+-+  uint32_t frame_width_height = p[4];
+-+  uint32_t frame_width = frame_width_height>>16;
+-+  uint32_t frame_height = (frame_width_height<<16)>>16;
+-+  uint32_t pitch = p[5];
+-+  uint32_t dst_pitch = p[6];
+-+  int offset_shift = p[7];
+-+  int32_t offset_before = offset_shift>>16;
+-+  int32_t denom = (offset_shift<<16)>>16;
+-+  while(1) {
+-+    p += 9;
+-+    next_kernel = p[8-9];
+-+    y_x = p[0-9];
+-+    x0 = (y_x<<16)>>16;
+-+    y0 = y_x>>16;
+-+    y2_x2 = p[2-9];
+-+    x2 = (y2_x2<<16)>>16;
+-+    y2 = y2_x2>>16;
+-+
+-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+      int x,y;
+-+      uint32_t width_height = p[4];
+-+      uint32_t my2_mx2_my_mx = p[5];
+-+      uint32_t offset_weight = p[6];
+-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
+-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter) {
+-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+          }
+-+          else {
+-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+static void rpi_simulate_inter_qpu(HEVCContext *s)
+-+{
+-+  // First run the transform as normal
+-+  int i;
+-+  rpi_execute_transform(s);
+-+  for(i=0;i<8;i++)
+-+  {
+-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+-+  }
+-+  for(i=0;i<12;i++)
+-+  {
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
+-+  }
+-+}
+-+
+-+#endif
+-+
+-+
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-@@ -3006,7 +3273,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+--        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-+        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+-     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-@@ -3016,11 +3283,16 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-     }
+-     s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- #endif
+- 
+-+#ifdef RPI_SIMULATE_QPUS
+-+    rpi_simulate_inter_qpu(s);
+-+    s->vpu_id = -1;
+-+    return;
+-+#endif
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+-@@ -3101,7 +3373,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+- #endif
+- 
+--    /*if (!s->enable_rpi) {
+-+    if (!s->enable_rpi) {
+-       if (s->ps.pps->cross_component_prediction_enabled_flag)
+-         printf("Cross component\n");
+-       if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+-@@ -3110,7 +3382,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         printf("Weighted P slice\n");
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+--    }*/
+-+    }
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index e12304b..4480f72 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -13,7 +13,7 @@
+- #include <stdlib.h>
+- #include <string.h>
+- #include <stddef.h>
+--#include <assert.h>
+-+#include "libavutil/avassert.h"
+- 
+- #include "config.h"
+- 
+-@@ -160,13 +160,13 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   // Now copy over the QPU code into GPU memory
+-   {
+-     int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+--    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-   }
+-   // And the VPU code
+-   {
+-     int num_bytes = sizeof(rpi_hevc_transform);
+--    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-   }
+-   // And the transform coefficients
+-@@ -216,13 +216,13 @@ static void gpu_unlock(void) {
+- static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+-   p->numbytes = numbytes;
+-   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+--  assert(p->vcsm_handle);
+-+  av_assert0(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--  assert(p->vc_handle);
+-+  av_assert0(p->vc_handle);
+-   p->arm = vcsm_lock(p->vcsm_handle);
+--  assert(p->arm);
+-+  av_assert0(p->arm);
+-   p->vc = mem_lock(mb, p->vc_handle);
+--  assert(p->vc);
+-+  av_assert0(p->vc);
+-   return 0;
+- }
+- 
+-@@ -243,7 +243,7 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+- 
+- int gpu_get_mailbox(void)
+- {
+--  assert(gpu);
+-+  av_assert0(gpu);
+-   return gpu->mb;
+- }
+- 
+-@@ -297,13 +297,13 @@ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+--  assert(p->vcsm_handle);
+-+  av_assert0(p->vcsm_handle);
+-   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+--  assert(p->vc_handle);
+-+  av_assert0(p->vc_handle);
+-   p->arm = vcsm_lock(p->vcsm_handle);
+--  assert(p->arm);
+-+  av_assert0(p->arm);
+-   p->vc = mem_lock(gpu->mb, p->vc_handle);
+--  assert(p->vc);
+-+  av_assert0(p->vc);
+-   return 0;
+- }
+- 
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 60d1ec2..0686249 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -149,8 +149,8 @@ add t0s, r0, r1 ; mov ra_frame_base, r2
+- add t1s, r2, r1
+- 
+- mov r2,8
+--shl rb12,unif, r2 # offset before shift
+--add rb13,unif,r2  # offset after shift
+-+shl rb12,unif,r2 # offset before shift
+-+add rb13,unif,r2  # denominator
+- 
+- # Compute part of VPM to use for DMA output
+- mov r2, unif
+-@@ -185,7 +185,7 @@ add t1s, r1, ra_frame_base
+- 
+- ################################################################################
+- 
+--# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+- 
+- # At this point we have already issued two pairs of texture requests for the current block
+- # ra_x, ra_x16_base point to the current coordinates for this block
+--- 
+-2.7.4
+-
+-
+-From 8bdf6b06c612ff4971c2ce99a62d093cf92468ca Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 13:17:50 +0100
+-Subject: [PATCH 49/68] Increased motion vector memory and fixed block size
+- computation for non-multiple of 2 block sizes
+-
+----
+- libavcodec/hevc.c | 50 +++++++++++++++++++++++++++++++-------------------
+- 1 file changed, 31 insertions(+), 19 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 34d92e2..3fb1e2a 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -83,11 +83,9 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- // Split image of 2048 into parts 64 wide
+- // So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+--// Each block of 64*64
+--// Smallest CTU size is 16x16, so smallest block is 8x8
+--// Corresponds to a total of 83kbytes over all 12 QPUs
+-+// For each block of 64*64 the smallest block size is 8x4
+- #define RPI_LUMA_COMMAND_WORDS 9
+--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
+-+#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -2042,11 +2040,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-@@ -2089,12 +2089,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       if (weight_flag) {
+-@@ -2141,11 +2143,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+--                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
+-@@ -2189,12 +2193,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2246,11 +2252,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             uint32_t *y = s->y_mvs[chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+--                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
+-+                  *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   *y++ = 1; // B frame weighted prediction not supported
+-                   *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-@@ -2293,12 +2301,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 uint32_t *u = s->u_mvs[chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-                       u+=2; // Weights not supported in B slices
+-@@ -2309,7 +2319,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+--                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-                       u+=2; // Weights not supported in B slices
+-@@ -3178,14 +3188,15 @@ static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+- }
+- 
+- // mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+--static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+- {
+-   uint32_t next_kernel;
+-   int y_x,y2_x2;
+--  uint32_t x0;
+--  uint32_t y0;
+--  uint32_t x2;
+--  uint32_t y2;
+-+  int x0;
+-+  int y0;
+-+  int x2;
+-+  int y2;
+-+  uint32_t *p0 = p;
+-   uint8_t *ref_y_base;
+-   uint8_t *ref_y2_base;
+-   uint32_t frame_width_height = p[4];
+-@@ -3215,13 +3226,15 @@ static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
+-       uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-       uint32_t width = width_height >> 16;
+-       uint32_t height = (width_height << 16) >> 16;
+-+      uint8_t *dst_base = s->frame->data[0];
+-       ref_y_base = compute_arm_addr(s,p[1-9],0);
+-       ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-       for (y=0; y<height; ++y) {
+-         for (x=0; x<width; ++x) {
+-           if (next_kernel==s->mc_filter) {
+-             int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+--            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            refa = av_clip_uint8(refa);
+-+            this_dst[x+y*dst_pitch] = refa;
+-           }
+-           else {
+-             int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-@@ -3248,7 +3261,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+-   }
+-   for(i=0;i<12;i++)
+-   {
+--    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+-   }
+- }
+- 
+-@@ -3290,7 +3303,6 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_SIMULATE_QPUS
+-     rpi_simulate_inter_qpu(s);
+--    s->vpu_id = -1;
+-     return;
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From da5ae7e96dd961ccc7bc162c8acf336d54a50092 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 14:36:54 +0100
+-Subject: [PATCH 50/68] Added support for skip deblock
+-
+----
+- libavcodec/hevc.c        |  5 +++++
+- libavcodec/hevc.h        |  2 ++
+- libavcodec/hevc_filter.c | 14 ++++----------
+- 3 files changed, 11 insertions(+), 10 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 3fb1e2a..0ac4f4c 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3397,6 +3397,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #endif
+-+    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+-+                        s->nal_unit_type == NAL_TSA_N   ||
+-+                        s->nal_unit_type == NAL_STSA_N  ||
+-+                        s->nal_unit_type == NAL_RADL_N  ||
+-+                        s->nal_unit_type == NAL_RASL_N);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 5df9dcd..5cb90b5 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -890,6 +890,8 @@ typedef struct HEVCContext {
+-     int                 width;
+-     int                 height;
+- 
+-+    int used_for_ref;
+-+
+- #ifdef RPI
+-     int enable_rpi;
+-     HEVCMvCmd *unif_mv_cmds;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 11629e4..14a0952 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -512,16 +512,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                s->ps.pps->transquant_bypass_enable_flag;
+- 
+- #ifdef DISABLE_DEBLOCK_NONREF
+--    if (    s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )
+-+    if (!s->used_for_ref)
+-       return; // Don't deblock non-reference frames
+- #endif
+- #ifdef DISABLE_DEBLOCK
+-     return;
+- #endif
+-+    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
+-+        return;
+- 
+-     if (x0) {
+-         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+-@@ -885,11 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
+- 
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+--    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
+--            s->nal_unit_type == NAL_TSA_N   ||
+--            s->nal_unit_type == NAL_STSA_N  ||
+--            s->nal_unit_type == NAL_RADL_N  ||
+--            s->nal_unit_type == NAL_RASL_N )) {
+-+    if (s->enable_rpi && s->used_for_ref) {
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+-         int curr_y = ((int *)f->progress->data)[0];
+--- 
+-2.7.4
+-
+-
+-From 6401d88c310cd3bfec7be94bf3ceb6d0c5736c7e Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Tue, 2 Jun 2015 15:22:52 +0100
+-Subject: [PATCH 51/68] Added support for skip_frame
+-
+----
+- libavcodec/hevc.c | 15 ++++++++++-----
+- 1 file changed, 10 insertions(+), 5 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 0ac4f4c..639e4df 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3397,11 +3397,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #endif
+--    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+--                        s->nal_unit_type == NAL_TSA_N   ||
+--                        s->nal_unit_type == NAL_STSA_N  ||
+--                        s->nal_unit_type == NAL_RADL_N  ||
+--                        s->nal_unit_type == NAL_RASL_N);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-@@ -3925,6 +3920,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+-         if (ret < 0)
+-             return ret;
+- 
+-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+-+                        s->nal_unit_type == NAL_TSA_N   ||
+-+                        s->nal_unit_type == NAL_STSA_N  ||
+-+                        s->nal_unit_type == NAL_RADL_N  ||
+-+                        s->nal_unit_type == NAL_RASL_N);
+-+
+-+        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+-+            s->is_decoded = 0;
+-+            break;
+-+        }
+-         if (s->max_ra == INT_MAX) {
+-             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+-                 s->max_ra = s->poc;
+--- 
+-2.7.4
+-
+-
+-From d2951e2ca73e234d1b775621e3993948a4a2c8ea Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 09:15:38 +0100
+-Subject: [PATCH 52/68] Fixed cache flushing of luma when using old method
+-
+----
+- libavcodec/hevc_filter.c | 2 +-
+- 1 file changed, 1 insertion(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 14a0952..b286bbf 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -919,7 +919,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         flush_buffer(s->frame->buf[1]);
+-         flush_buffer(s->frame->buf[2]);
+- #ifdef RPI_LUMA_QPU
+--        flush_buffer(s->frame->buf[1]);
+-+        flush_buffer(s->frame->buf[0]);
+- #endif
+- 
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 7ae612e69c1cabcc7d0b37b65efa8c5bdcfa7bf5 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 11:37:27 +0100
+-Subject: [PATCH 53/68] Option to parallelise coefficient decode and inter
+- prediction and deblock for each frame
+-
+----
+- libavcodec/hevc.c              | 701 +++++++++++++++++++++++++++--------------
+- libavcodec/hevc.h              |  74 +++--
+- libavcodec/hevc_cabac.c        |  12 +-
+- libavcodec/hevcpred_template.c |   5 +-
+- 4 files changed, 522 insertions(+), 270 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 639e4df..12aacc5 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,8 +43,6 @@
+- 
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+--  // For some unknown reason, the code seems to crash if I do a late malloc
+--  //#define EARLY_MALLOC
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- 
+-@@ -58,6 +56,21 @@
+- 
+-   // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
+-   //#define RPI_SIMULATE_QPUS
+-+  #ifdef RPI_WORKER
+-+    #include "pthread.h"
+-+  #endif
+-+
+-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
+-+  static void rpi_execute_transform(HEVCContext *s);
+-+  static void rpi_execute_inter_qpu(HEVCContext *s);
+-+  static void rpi_execute_pred_cmds(HEVCContext *s);
+-+  static void rpi_execute_inter_cmds(HEVCContext *s);
+-+  static void rpi_inter_clear(HEVCContext *s);
+-+
+-+  // Define INTER_PASS0 to do inter prediction in first pass
+-+  //#define INTER_PASS0
+-+  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
+-+  //#define LAUNCH_PASS0
+- 
+- #endif
+- 
+-@@ -105,6 +118,143 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-   return p->vc;
+- }
+-+#endif
+-+
+-+
+-+#ifdef RPI_WORKER
+-+
+-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+-+
+-+#define LOG_ENTER
+-+#define LOG_EXIT
+-+
+-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job
+-+static void worker_submit_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
+-+  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to say we have completed pass1
+-+static void worker_complete_middle_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
+-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to say we have completed pass2
+-+static void worker_complete_job(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  //pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_head++; // This is the only place that can change head so we do not need the mutex
+-+  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
+-+  //pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call this to wait for all jobs to have completed at the end of a frame
+-+static void worker_wait(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  while( s->worker_head !=s->worker_tail)
+-+  {
+-+    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+-+  }
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
+-+// available to receive the next job.
+-+static void worker_pass0_ready(HEVCContext *s)
+-+{
+-+  LOG_ENTER
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+    // tail is number of submitted jobs
+-+    // head is number of completed jobs
+-+    // tail-head is number of outstanding jobs in the queue
+-+    // we need to ensure there is at least 1 space left for us to use
+-+    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
+-+    {
+-+      // Wait until another job is completed
+-+      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+  LOG_EXIT
+-+}
+-+
+-+static void *worker_start(void *arg)
+-+{
+-+  HEVCContext *s = (HEVCContext *)arg;
+-+  while(1) {
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+
+-+    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
+-+    {
+-+      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+
+-+    if (s->kill_worker) {
+-+      break;
+-+    }
+-+    LOG_ENTER
+-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+#ifndef LAUNCH_PASS0
+-+    rpi_execute_inter_qpu(s);
+-+#endif
+-+#ifndef INTER_PASS0
+-+    // Perform inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+#endif
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+
+-+    worker_complete_middle_job(s);
+-+    LOG_EXIT
+-+  }
+-+  return NULL;
+-+}
+-+
+-+static void *worker_deblock_start(void *arg)
+-+{
+-+  HEVCContext *s = (HEVCContext *)arg;
+-+  while(1) {
+-+    pthread_mutex_lock(&s->worker_mutex);
+-+    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
+-+    {
+-+      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
+-+    }
+-+    pthread_mutex_unlock(&s->worker_mutex);
+-+
+-+    if (s->kill_worker) {
+-+      break;
+-+    }
+-+    LOG_ENTER
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+
+-+    worker_complete_job(s);
+-+    LOG_EXIT
+-+  }
+-+  return NULL;
+-+}
+- 
+- #endif
+- 
+-@@ -121,19 +271,18 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+- static void pic_arrays_free(HEVCContext *s)
+- {
+- #ifdef RPI
+--
+--#ifdef EARLY_MALLOC
+--#else
+--    if (s->coeffs_buf_arm[0]) {
+--      gpu_free(&s->coeffs_buf_default);
+--      s->coeffs_buf_arm[0] = 0;
+--    }
+--    if (s->coeffs_buf_arm[2]) {
+--      gpu_free(&s->coeffs_buf_accelerated);
+--      s->coeffs_buf_arm[2] = 0;
+-+    int job;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      if (s->coeffs_buf_arm[job][0]) {
+-+        gpu_free(&s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = 0;
+-+      }
+-+      if (s->coeffs_buf_arm[job][2]) {
+-+        gpu_free(&s->coeffs_buf_accelerated[job]);
+-+        s->coeffs_buf_arm[job][2] = 0;
+-+      }
+-     }
+- #endif
+--#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -171,24 +320,26 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+- #ifdef RPI
+--#ifdef EARLY_MALLOC
+--#else
+-     av_assert0(sps);
+-     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--    if (!s->coeffs_buf_arm[0])
+--        goto fail;
+--    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--    if (!s->coeffs_buf_arm[2])
+--        goto fail;
+--    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--    printf("Done\n");
+--#endif
+-+    int job;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      printf("Allocated %d\n",coefs_per_row);
+-+      for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-+        if (!s->coeffs_buf_arm[job][0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
+-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-+        if (!s->coeffs_buf_arm[job][2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
+-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-+      }
+-+    }
+- #endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-@@ -1036,7 +1187,7 @@ static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0,
+- {
+-     if (s->enable_rpi) {
+-         HEVCLocalContext *lc = s->HEVClc;
+--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-         cmd->type = RPI_PRED_INTRA;
+-         cmd->size = log2_trafo_size;
+-         cmd->c_idx = c_idx;
+-@@ -1496,7 +1647,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
+-                         int block_w, int block_h, int luma_weight, int luma_offset)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_LUMA_UNI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1515,7 +1666,7 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+-                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_LUMA_BI;
+-     cmd->dst = dst;
+-     cmd->dststride = dststride;
+-@@ -1537,7 +1688,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_CHROMA_UNI;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -1555,7 +1706,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+- static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+-     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+-     cmd->dst = dst0;
+-     cmd->dststride = dststride;
+-@@ -2037,7 +2188,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2057,7 +2208,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+-         {
+-@@ -2086,7 +2237,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2110,7 +2261,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2140,7 +2291,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2160,7 +2311,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+- 
+-@@ -2190,7 +2341,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2215,7 +2366,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2249,7 +2400,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int x2 = x0 + (mv2->x >> 2);
+-             int y2 = y0 + (mv2->y >> 2);
+-             int chan = x0>>6; // 64 wide blocks per QPU
+--            uint32_t *y = s->y_mvs[chan % 12];
+-+            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-                   int bw = nPbW-start_x;
+-@@ -2265,7 +2416,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+--            s->y_mvs[chan % 12] = y;
+-+            s->y_mvs[s->pass0_job][chan % 12] = y;
+-         } else
+- #endif
+-         {
+-@@ -2298,7 +2449,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+- 
+--                uint32_t *u = s->u_mvs[chan & 7];
+-+                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2327,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[chan & 7] = u;
+-+                s->u_mvs[s->pass0_job][chan & 7] = u;
+-                 return;
+-             }
+- #endif
+-@@ -2832,40 +2983,54 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- static void rpi_execute_dblk_cmds(HEVCContext *s)
+- {
+-     int n;
+-+    int job = s->pass2_job;
+-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+--    int (*p)[2] = s->dblk_cmds;
+--    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
+-+    int (*p)[2] = s->dblk_cmds[job];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+-         ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
+-     }
+--    s->num_dblk_cmds = 0;
+-+    s->num_dblk_cmds[job] = 0;
+- }
+- 
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-+#ifdef LAUNCH_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-     //int j;
+-     //int16_t *coeffs = s->coeffs_buf_arm[i];
+-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-     //    s->hevcdsp.idct[4-2](coeffs, 16);
+-     //}
+- 
+--    gpu_cache_flush(&s->coeffs_buf_accelerated);
+--    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-+    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
+-+                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-     //vpu_wait(s->vpu_id);
+- 
+-     for(i=0;i<4;i++)
+--        s->num_coeffs[i] = 0;
+-+        s->num_coeffs[job][i] = 0;
+- }
+- 
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+--  HEVCPredCmd *cmd = s->univ_pred_cmds;
+-+  int job = s->pass2_job;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+-+#ifdef RPI_WORKER
+-+  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+#else
+-   HEVCLocalContext *lc = s->HEVClc;
+-+#endif
+- 
+--  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
+-+  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+-       if (cmd->type == RPI_PRED_INTRA) {
+-           lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-           lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-@@ -2884,21 +3049,26 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+- #endif
+-       }
+-   }
+--  s->num_pred_cmds = 0;
+-+  s->num_pred_cmds[job] = 0;
+- }
+- 
+- static void rpi_execute_inter_cmds(HEVCContext *s)
+- {
+--    HEVCMvCmd *cmd = s->unif_mv_cmds;
+-+#ifdef INTER_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-     int n,cidx;
+-     AVFrame myref;
+-     AVFrame myref1;
+-     struct MvField mymv;
+--    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
+-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+-         printf("Overflow inter_cmds\n");
+-         exit(-1);
+-     }
+--    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
+-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+-         switch(cmd->cmd) {
+-         case RPI_CMD_LUMA_UNI:
+-             myref.data[0] = cmd->src;
+-@@ -2938,7 +3108,28 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+-             break;
+-         }
+-     }
+--    s->num_mv_cmds = 0;
+-+    s->num_mv_cmds[job] = 0;
+-+}
+-+
+-+static void rpi_do_all_passes(HEVCContext *s)
+-+{
+-+#ifdef RPI_INTER_QPU
+-+    // Kick off inter prediction on QPUs
+-+    rpi_execute_inter_qpu(s);
+-+#else
+-+    rpi_execute_transform(s);
+-+#endif
+-+    // Perform luma inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+#ifdef RPI_INTER_QPU
+-+    rpi_inter_clear(s);
+-+#endif
+- }
+- 
+- #endif
+-@@ -2946,6 +3137,7 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- #ifdef RPI_INTER_QPU
+- static void rpi_inter_clear(HEVCContext *s)
+- {
+-+    int job = s->pass0_job;
+-     int i;
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-@@ -2953,51 +3145,50 @@ static void rpi_inter_clear(HEVCContext *s)
+-                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+-     for(i=0;i<8;i++) {
+--        s->u_mvs[i] = s->mvs_base[i];
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = 0;
+--        *s->u_mvs[i]++ = pic_width;
+--        *s->u_mvs[i]++ = pic_height;
+--        *s->u_mvs[i]++ = s->frame->linesize[1];
+--        *s->u_mvs[i]++ = s->frame->linesize[2];
+-+        s->u_mvs[job][i] = s->mvs_base[job][i];
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = pic_width;
+-+        *s->u_mvs[job][i]++ = pic_height;
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+-         if (weight_flag) {
+--            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+--            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+            *s->u_mvs[job][i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
+-+            *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+-         } else {
+--            *s->u_mvs[i]++ = 1 << 5;
+--            *s->u_mvs[i]++ = 6;
+-+            *s->u_mvs[job][i]++ = 1 << 5;
+-+            *s->u_mvs[job][i]++ = 6;
+-         }
+--        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-     }
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(i=0;i<12;i++) {
+--        s->y_mvs[i] = s->y_mvs_base[i];
+--        *s->y_mvs[i]++ = 0; // y_x
+--        *s->y_mvs[i]++ = 0; // ref_y_base
+--        *s->y_mvs[i]++ = 0; // y2_x2
+--        *s->y_mvs[i]++ = 0; // ref_y2_base
+--        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+--        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
+--        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
+-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+-+        *s->y_mvs[job][i]++ = 0; // y_x
+-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
+-+        *s->y_mvs[job][i]++ = 0; // y2_x2
+-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+-         if (weight_flag) {
+-             int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
+-             int shift = s->sh.luma_log2_weight_denom + 6;
+--            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
+-         } else {
+-             int offset = 1 << 5;
+-             int shift = 6;
+--            *s->y_mvs[i]++ = (offset << 16) + shift;
+-+            *s->y_mvs[job][i]++ = (offset << 16) + shift;
+-         }
+--        *s->y_mvs[i]++ = 0; // Next kernel
+-+        *s->y_mvs[job][i]++ = 0; // Next kernel
+-     }
+- #endif
+- }
+- 
+--
+- #ifdef RPI_SIMULATE_QPUS
+- 
+- static int32_t clipx(int x,int FRAME_WIDTH)
+-@@ -3271,10 +3462,15 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- static void rpi_execute_inter_qpu(HEVCContext *s)
+- {
+-     int k;
+-+#ifdef LAUNCH_PASS0
+-+    int job = s->pass0_job;
+-+#else
+-+    int job = s->pass1_job;
+-+#endif
+-     int i;
+--    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+- #ifdef RPI_LUMA_QPU
+--    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+- #endif
+-     if (s->sh.slice_type == I_SLICE) {
+- #ifdef RPI_MULTI_MAILBOX
+-@@ -3283,22 +3479,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #endif
+-     }
+-     for(k=0;k<8;k++) {
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+--        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+-     }
+- 
+--    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(k=0;k<12;k++) {
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+--        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+--        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+-     }
+--    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+    s->y_mvs[job][12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+- #endif
+- 
+- #ifdef RPI_SIMULATE_QPUS
+-@@ -3308,34 +3504,34 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
+-+    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+- #else
+--    gpu_cache_flush(&s->coeffs_buf_accelerated);
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+- #endif
+--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+- #ifdef RPI_LUMA_QPU
+-                                    qpu_get_fn(QPU_MC_SETUP),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
+--                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+- #else
+-                                    0,
+-                                    0,0,0,0,
+-@@ -3344,17 +3540,17 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #endif
+-                                  );
+-     for(i=0;i<4;i++)
+--        s->num_coeffs[i] = 0;
+-+        s->num_coeffs[job][i] = 0;
+- #else
+-     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+--      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+--      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+-       );
+- #endif
+- 
+-@@ -3411,6 +3607,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         }
+-     }
+- 
+-+#ifdef RPI_WORKER
+-+    s->pass0_job = 0;
+-+    s->pass1_job = 0;
+-+    s->pass2_job = 0;
+-+#endif
+- #ifdef RPI_INTER_QPU
+-     rpi_inter_clear(s);
+- #endif
+-@@ -3431,46 +3632,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+-+
+- #ifdef RPI
+-         if (s->enable_rpi) {
+--          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
+--          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
+-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+-+          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+-           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+--            // Transform all blocks
+--            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--#ifdef RPI_MULTI_MAILBOX
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--            // Perform luma inter prediction
+--            rpi_execute_inter_cmds(s);
+--#else
+--            rpi_execute_transform(s);
+--            // Perform inter prediction
+--            rpi_execute_inter_cmds(s);
+--#ifdef RPI_INTER_QPU
+--            // Kick off inter prediction on QPUs
+--            rpi_execute_inter_qpu(s);
+--#endif
+--#endif
+--
+--            // Wait for transform completion
+--            vpu_wait(s->vpu_id);
+--
+--            // Copy back reconstructed data
+--            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
+--            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
+--            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
+-+#ifdef RPI_WORKER
+-+            if (s->used_for_ref) {
+-+              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+-+  #ifdef INTER_PASS0
+-+              rpi_execute_inter_cmds(s);
+-+  #endif
+-+  #ifdef LAUNCH_PASS0
+-+              rpi_execute_inter_qpu(s);
+-+  #endif
+-+              // Pass on this job to worker thread
+-+              worker_submit_job(s);
+-+              // Make sure we have space to prepare the next job
+-+              worker_pass0_ready(s);
+- 
+--            // Perform intra prediction and residual reconstruction
+--            rpi_execute_pred_cmds(s);
+--            // Perform deblocking for CTBs in this row
+--            rpi_execute_dblk_cmds(s);
+-+              // Prepare the next batch of commands
+- #ifdef RPI_INTER_QPU
+--            rpi_inter_clear(s);
+-+              rpi_inter_clear(s);
+-+#endif
+-+            } else {
+-+              // Non-ref frame so do it all on this thread
+-+              rpi_do_all_passes(s);
+-+            }
+-+#else
+-+            rpi_do_all_passes(s);
+- #endif
+-           }
+-         }
+- #endif
+-+
+-+
+-         if (more_data < 0) {
+-             s->tab_slice_address[ctb_addr_rs] = -1;
+-             return more_data;
+-@@ -3487,18 +3684,21 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     }
+- 
+- #ifdef RPI
+--    if (s->enable_rpi && s->num_dblk_cmds) {
+--#ifdef RPI_INTER_QPU
+--        rpi_execute_inter_qpu(s);
+--#endif
+--#ifndef RPI_MULTI_MAILBOX
+--        rpi_execute_transform(s);
+-+
+-+#ifdef RPI_WORKER
+-+    // Wait for the worker to finish all its jobs
+-+    if (s->enable_rpi) {
+-+        worker_wait(s);
+-+        av_assert0(s->pass0_job==s->pass1_job);
+-+        av_assert0(s->pass1_job==s->pass2_job);
+-+    }
+- #endif
+--        rpi_execute_inter_cmds(s);
+--        vpu_wait(s->vpu_id);
+--        rpi_execute_pred_cmds(s);
+--        rpi_execute_dblk_cmds(s);
+-+
+-+    // Finish off any half-completed rows
+-+    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
+-+        rpi_do_all_passes(s);
+-     }
+-+
+- #endif
+- 
+-     if (x_ctb + ctb_size >= s->ps.sps->width &&
+-@@ -4230,6 +4430,48 @@ fail:
+-     return AVERROR(ENOMEM);
+- }
+- 
+-+#ifdef RPI_WORKER
+-+static av_cold void hevc_init_worker(HEVCContext *s)
+-+{
+-+    int err;
+-+    pthread_cond_init(&s->worker_cond_head, NULL);
+-+    pthread_cond_init(&s->worker_cond_middle, NULL);
+-+    pthread_cond_init(&s->worker_cond_tail, NULL);
+-+    pthread_mutex_init(&s->worker_mutex, NULL);
+-+
+-+    s->worker_tail=0;
+-+    s->worker_middle=0;
+-+    s->worker_head=0;
+-+    s->kill_worker=0;
+-+    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+-+    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
+-+    if (err) {
+-+        printf("Failed to create worker thread\n");
+-+        exit(-1);
+-+    }
+-+}
+-+
+-+static av_cold void hevc_exit_worker(HEVCContext *s)
+-+{
+-+    void *res;
+-+    s->kill_worker=1;
+-+    pthread_cond_broadcast(&s->worker_cond_tail);
+-+    pthread_cond_broadcast(&s->worker_cond_middle);
+-+    pthread_join(s->worker_thread, &res);
+-+    pthread_join(s->worker_deblock_thread, &res);
+-+
+-+    pthread_cond_destroy(&s->worker_cond_head);
+-+    pthread_cond_destroy(&s->worker_cond_middle);
+-+    pthread_cond_destroy(&s->worker_cond_tail);
+-+    pthread_mutex_destroy(&s->worker_mutex);
+-+
+-+    s->worker_tail=0;
+-+    s->worker_middle=0;
+-+    s->worker_head=0;
+-+    s->kill_worker=0;
+-+}
+-+#endif
+-+
+- static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- {
+-     HEVCContext       *s = avctx->priv_data;
+-@@ -4242,33 +4484,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+-     av_freep(&s->cabac_state);
+- 
+- #ifdef RPI
+--    av_freep(&s->unif_mv_cmds);
+--    av_freep(&s->univ_pred_cmds);
+-+
+-+#ifdef RPI_WORKER
+-+    hevc_exit_worker(s);
+-+#endif
+-+
+-+    for(i=0;i<RPI_MAX_JOBS;i++) {
+-+      av_freep(&s->unif_mv_cmds[i]);
+-+      av_freep(&s->univ_pred_cmds[i]);
+- 
+- #ifdef RPI_INTER_QPU
+--    if (s->unif_mvs) {
+--        gpu_free( &s->unif_mvs_ptr );
+--        s->unif_mvs = 0;
+--    }
+-+      if (s->unif_mvs[i]) {
+-+        gpu_free( &s->unif_mvs_ptr[i] );
+-+        s->unif_mvs[i] = 0;
+-+      }
+- #endif
+- #ifdef RPI_LUMA_QPU
+--    if (s->y_unif_mvs) {
+--        gpu_free( &s->y_unif_mvs_ptr );
+--        s->y_unif_mvs = 0;
+--    }
+-+      if (s->y_unif_mvs[i]) {
+-+        gpu_free( &s->y_unif_mvs_ptr[i] );
+-+        s->y_unif_mvs[i] = 0;
+-+      }
+- #endif
+--
+--#ifdef EARLY_MALLOC
+--    printf("hevc_decode_free\n");
+--    if (s->coeffs_buf_arm[0]) {
+--      gpu_free(&s->coeffs_buf_default);
+--      s->coeffs_buf_arm[0] = 0;
+--    }
+--    if (s->coeffs_buf_arm[2]) {
+--      gpu_free(&s->coeffs_buf_accelerated);
+--      s->coeffs_buf_arm[2] = 0;
+-     }
+--#endif
+-+
+- #endif
+- 
+-     for (i = 0; i < 3; i++) {
+-@@ -4328,6 +4566,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+- {
+-     HEVCContext *s = avctx->priv_data;
+-     int i;
+-+    int job;
+- 
+-     s->avctx = avctx;
+- 
+-@@ -4338,12 +4577,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     s->sList[0] = s;
+- 
+- #ifdef RPI
+--    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+--    if (!s->unif_mv_cmds)
+--        goto fail;
+--    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+--    if (!s->univ_pred_cmds)
+--        goto fail;
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+        if (!s->unif_mv_cmds[job])
+-+            goto fail;
+-+        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-+        if (!s->univ_pred_cmds[job])
+-+            goto fail;
+-+    }
+- 
+- #ifdef RPI_INTER_QPU
+-     // We divide the image into blocks 256 wide and 64 high
+-@@ -4354,18 +4595,20 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     {
+-         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-         uint32_t *p;
+-+		for(job=0;job<RPI_MAX_JOBS;job++) {
+- #ifdef RPI_CACHE_UNIF_MVS
+--        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+- #else
+--        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+- #endif
+--        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+- 
+--        // Set up initial locations for uniform streams
+--        p = s->unif_mvs;
+--        for(i = 0; i < 8; i++) {
+--            s->mvs_base[i] = p;
+-+          // Set up initial locations for uniform streams
+-+          p = s->unif_mvs[job];
+-+          for(i = 0; i < 8; i++) {
+-+            s->mvs_base[job][i] = p;
+-             p += uv_commands_per_qpu;
+-+          }
+-         }
+-         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-         s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-@@ -4374,61 +4617,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+-     }
+- #endif
+- #ifdef RPI_LUMA_QPU
+-+    for(job=0;job<RPI_MAX_JOBS;job++)
+-     {
+-         int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-         uint32_t *p;
+- #ifdef RPI_CACHE_UNIF_MVS
+--        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+- #else
+--        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+- #endif
+--        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
+-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+- 
+-         // Set up initial locations for uniform streams
+--        p = s->y_unif_mvs;
+-+        p = s->y_unif_mvs[job];
+-         for(i = 0; i < 12; i++) {
+--            s->y_mvs_base[i] = p;
+-+            s->y_mvs_base[job][i] = p;
+-             p += y_commands_per_qpu;
+-         }
+--        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+--        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+--
+-     }
+-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+- #endif
+-     //gpu_malloc_uncached(2048*64,&s->dummy);
+- 
+--#ifdef EARLY_MALLOC
+--    {
+--        int coeffs_in_ctb = 64*64;
+--        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
+--        s->coeffs_buf_arm[0] = 0;
+--        s->coeffs_buf_arm[2] = 0;
+--        printf("Allocated %d\n",coefs_per_row);
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
+--        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
+--        if (!s->coeffs_buf_arm[0])
+--            goto fail;
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
+--        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
+--        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
+--        if (!s->coeffs_buf_arm[2])
+--            goto fail;
+--        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
+--        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
+--        printf("Done\n");
+--#ifdef RPI_PRECLEAR
+--        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
+--        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
+--        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
+--        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
+--#endif
+--    }
+--#endif
+--
+-     s->enable_rpi = 0;
+- 
+-+#ifdef RPI_WORKER
+-+    hevc_init_worker(s);
+-+#endif
+-+
+- #endif
+- 
+-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 5cb90b5..7bd295a 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -51,6 +51,12 @@
+-     // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-     #define RPI_LUMA_QPU
+-   #endif
+-+
+-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+-+  #define RPI_MAX_JOBS 2
+-+  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-+  #define RPI_WORKER
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -806,6 +812,13 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+-+#ifdef RPI_WORKER
+-+typedef struct HEVCLocalContextIntra {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;
+-+} HEVCLocalContextIntra;
+-+#endif
+-+
+- #ifdef RPI
+- 
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-@@ -874,7 +887,7 @@ typedef struct HEVCPredCmd {
+- 
+- typedef struct HEVCContext {
+- #ifdef RPI
+--    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
+-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+- #endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+-@@ -883,7 +896,9 @@ typedef struct HEVCContext {
+- 
+-     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+-     HEVCLocalContext    *HEVClc;
+--
+-+#ifdef RPI_WORKER
+-+    HEVCLocalContextIntra HEVClcIntra;
+-+#endif
+-     uint8_t             threads_type;
+-     uint8_t             threads_number;
+- 
+-@@ -894,43 +909,60 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI
+-     int enable_rpi;
+--    HEVCMvCmd *unif_mv_cmds;
+--    HEVCPredCmd *univ_pred_cmds;
+-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+-+    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+-     int buf_width;
+--    GPU_MEM_PTR_T coeffs_buf_default;
+--    GPU_MEM_PTR_T coeffs_buf_accelerated;
+--    int16_t *coeffs_buf_arm[4];
+--    unsigned int coeffs_buf_vc[4];
+--    int num_coeffs[4];
+--    int num_xfm_cmds;
+--    int num_mv_cmds;
+--    int num_pred_cmds;
+--    int num_dblk_cmds;
+-+    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+-+    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
+-+    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
+-+    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+-+    int num_coeffs[RPI_MAX_JOBS][4];
+-+    int num_xfm_cmds[RPI_MAX_JOBS];
+-+    int num_mv_cmds[RPI_MAX_JOBS];
+-+    int num_pred_cmds[RPI_MAX_JOBS];
+-+    int num_dblk_cmds[RPI_MAX_JOBS];
+-     int vpu_id;
+-     //GPU_MEM_PTR_T dummy;
+-+    int pass0_job; // Pass0 does coefficient decode
+-+    int pass1_job; // Pass1 does pixel processing
+-+    int pass2_job; // Pass2 does reconstruction and deblocking
+- #ifdef RPI_INTER_QPU
+--    GPU_MEM_PTR_T unif_mvs_ptr;
+--    uint32_t *unif_mvs; // Base of memory for motion vector commands
+-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+- 
+-     // _base pointers are to the start of the row
+--    uint32_t *mvs_base[8];
+-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-     // these pointers are to the next free space
+--    uint32_t *u_mvs[8];
+-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-     uint32_t mc_filter_uv_b0;
+-     uint32_t mc_filter_uv_b;
+- #endif
+- #ifdef RPI_LUMA_QPU
+--    GPU_MEM_PTR_T y_unif_mvs_ptr;
+--    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
+--    uint32_t *y_mvs_base[12];
+--    uint32_t *y_mvs[12];
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-     // Function pointers
+-     uint32_t mc_filter;
+-     uint32_t mc_filter_b;
+- #endif
+- 
+-+#ifdef RPI_WORKER
+-+    pthread_t worker_thread;
+-+    pthread_t worker_deblock_thread;
+-+    pthread_cond_t worker_cond_head;
+-+    pthread_cond_t worker_cond_tail;
+-+    pthread_cond_t worker_cond_middle;
+-+    pthread_mutex_t worker_mutex;
+-+
+-+    int worker_tail; // Contains the number of posted jobs
+-+    int worker_head; // Contains the number of completed jobs
+-+    int worker_middle; // Contains the number of completed jobs
+-+    int kill_worker; // set to 1 to terminate the worker
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 38f53de..f0982cd 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1051,11 +1051,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     if (s->enable_rpi) {
+-         int n = trafo_size * trafo_size;
+-         if (use_vpu) {
+--            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
+--            s->num_coeffs[log2_trafo_size - 2] += n;
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-         } else {
+--            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
+--            s->num_coeffs[0] += n;
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-+            s->num_coeffs[s->pass0_job][0] += n;
+-         }
+-     }
+-     // We now do the memset after transform_add while we know the data is cached.
+-@@ -1508,7 +1508,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+-             }
+-         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+--            s->hevcdsp.idct_4x4_luma(coeffs);
+-+           s->hevcdsp.idct_4x4_luma(coeffs);
+-         } else {
+- #ifdef RPI
+-             if (!use_vpu) {
+-@@ -1553,7 +1553,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     }
+- #ifdef RPI
+-     if (s->enable_rpi) {
+--        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-         cmd->type = RPI_PRED_TRANSFORM_ADD;
+-         cmd->size = log2_trafo_size;
+-         cmd->buf = coeffs;
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 71c6d52..344e021 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -71,8 +71,11 @@ do {                                  \
+-                 AV_WN4P(&ptr[i], a);                                           \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+--
+-+#ifdef RPI_WORKER
+-+    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+#else
+-     HEVCLocalContext *lc = s->HEVClc;
+-+#endif
+-     int i;
+-     int hshift = s->ps.sps->hshift[c_idx];
+-     int vshift = s->ps.sps->vshift[c_idx];
+--- 
+-2.7.4
+-
+-
+-From 1e0885f8d98175777fff65b4cedd708176c2abcf Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 13:43:48 +0100
+-Subject: [PATCH 54/68] Avoid lockup bug with RPI_WORKER enabled
+-
+----
+- libavcodec/hevc.c       | 22 +++++++++++-----------
+- libavcodec/hevc_cabac.c |  1 -
+- 2 files changed, 11 insertions(+), 12 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 12aacc5..182a82f 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -133,11 +133,11 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
+- static void worker_submit_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_tail++;
+-   s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-   pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-@@ -145,11 +145,11 @@ static void worker_submit_job(HEVCContext *s)
+- static void worker_complete_middle_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_middle++;
+-   s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-@@ -157,11 +157,11 @@ static void worker_complete_middle_job(HEVCContext *s)
+- static void worker_complete_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+--  //pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_head++; // This is the only place that can change head so we do not need the mutex
+-+  pthread_mutex_lock(&s->worker_mutex);
+-+  s->worker_head++;
+-   s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
+--  //pthread_mutex_unlock(&s->worker_mutex);
+-+  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+-+  pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+- }
+- 
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index f0982cd..6523e66 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1497,7 +1497,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-                 for (i = 0; i < 8; i++)
+-                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+-             }
+--
+-             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+- 
+-             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+--- 
+-2.7.4
+-
+-
+-From 1d7ad81069dec6914ec7e9983855d7a1b5e4b123 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 15:37:19 +0100
+-Subject: [PATCH 55/68] Added code to flush buffers at start of frame
+-
+----
+- libavcodec/hevc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+- 1 file changed, 72 insertions(+)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 182a82f..e5b9f1e 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -43,6 +43,7 @@
+- 
+- #ifdef RPI
+-   #include "rpi_qpu.h"
+-+  #include "rpi_user_vcsm.h"
+-   // Move Inter prediction into separate pass
+-   #define RPI_INTER
+- 
+-@@ -3508,6 +3509,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- #else
+-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+- #endif
+-+
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-@@ -3558,6 +3560,71 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- }
+- #endif
+- 
+-+#ifdef RPI
+-+
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+-+
+-+static void flush_frame(HEVCContext *s,AVFrame *frame)
+-+{
+-+#if 1
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[2]);
+-+    iocache.s[1].handle = p->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].size  = sz;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+#endif
+-+}
+-+
+-+static void flush_all(HEVCContext *s)
+-+{
+-+#if 0
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 4; // Flush all
+-+    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].size  = 4096;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+  int i,k;
+-+  for(i=0;i<2;i++) {
+-+    for (k = 0; k < s->sh.nb_refs[i]; k++) {
+-+      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
+-+    }
+-+  }
+-+  flush_frame(s,s->frame);
+-+#endif
+-+}
+-+#endif
+-+
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- {
+-     HEVCContext *s  = avctxt->priv_data;
+-@@ -3592,8 +3659,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         printf("Weighted B slice\n");
+-     }
+- 
+-+    // Now flush all reference frames and our destination frame to get everything ready for decode
+-+    flush_all(s);
+- #endif
+- 
+-+    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+-+
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+-         return AVERROR_INVALIDDATA;
+-@@ -3664,6 +3735,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-             rpi_do_all_passes(s);
+- #endif
+-           }
+-+
+-         }
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From 7a57f233dcd4048e20a0b5bc06bc20abb589d3fa Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 3 Jun 2015 16:42:24 +0100
+-Subject: [PATCH 56/68] Reduce the amount that needs to be flushed
+-
+----
+- libavcodec/hevc.c | 35 +++++++++++------------------------
+- 1 file changed, 11 insertions(+), 24 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index e5b9f1e..73d7f74 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3569,7 +3569,7 @@ static void flush_buffer(AVBufferRef *bref) {
+- 
+- static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+--#if 1
+-+#ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-@@ -3603,26 +3603,6 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+--static void flush_all(HEVCContext *s)
+--{
+--#if 0
+--    struct vcsm_user_clean_invalid_s iocache = {};
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
+--    iocache.s[0].handle = p->vcsm_handle;
+--    iocache.s[0].cmd = 4; // Flush all
+--    iocache.s[0].addr = p->arm;
+--    iocache.s[0].size  = 4096;
+--    vcsm_clean_invalid( &iocache );
+--#else
+--  int i,k;
+--  for(i=0;i<2;i++) {
+--    for (k = 0; k < s->sh.nb_refs[i]; k++) {
+--      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
+--    }
+--  }
+--  flush_frame(s,s->frame);
+--#endif
+--}
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -3658,9 +3638,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+-     }
+--
+--    // Now flush all reference frames and our destination frame to get everything ready for decode
+--    flush_all(s);
+- #endif
+- 
+-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+-@@ -4130,6 +4107,11 @@ static int hevc_frame_start(HEVCContext *s)
+-     if (!s->avctx->hwaccel)
+-         ff_thread_finish_setup(s->avctx);
+- 
+-+#ifdef RPI_INTER_QPU
+-+    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
+-+    flush_frame(s,s->frame);
+-+#endif
+-+
+-     return 0;
+- 
+- fail:
+-@@ -4331,6 +4313,11 @@ fail:
+-         ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+-+    } else if (s->ref) {
+-+#ifdef RPI_INTER_QPU
+-+      // When running single threaded we need to flush the whole frame
+-+      flush_frame(s,s->frame);
+-+#endif
+-     }
+-     return ret;
+- }
+--- 
+-2.7.4
+-
+-
+-From 26eba8e3266cc5f2120e8284a1ce486d6a402010 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 4 Jun 2015 07:59:28 +0100
+-Subject: [PATCH 57/68] Corrected support for disabled rpi when using
+- RPI_WORKER
+-
+----
+- libavcodec/hevc.h              | 18 ++++++++++--------
+- libavcodec/hevcpred_template.c |  2 +-
+- 2 files changed, 11 insertions(+), 9 deletions(-)
+-
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7bd295a..3cb34bd 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -769,7 +769,17 @@ typedef struct HEVCFrame {
+-     uint8_t flags;
+- } HEVCFrame;
+- 
+-+#ifdef RPI_WORKER
+-+typedef struct HEVCLocalContextIntra {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;
+-+} HEVCLocalContextIntra;
+-+#endif
+-+
+- typedef struct HEVCLocalContext {
+-+    TransformUnit tu;
+-+    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
+-+
+-     uint8_t cabac_state[HEVC_CONTEXTS];
+- 
+-     uint8_t stat_coeff[4];
+-@@ -784,7 +794,6 @@ typedef struct HEVCLocalContext {
+- 
+-     int qPy_pred;
+- 
+--    TransformUnit tu;
+- 
+-     uint8_t ctb_left_flag;
+-     uint8_t ctb_up_flag;
+-@@ -801,7 +810,6 @@ typedef struct HEVCLocalContext {
+-     int ct_depth;
+-     CodingUnit cu;
+-     PredictionUnit pu;
+--    NeighbourAvailable na;
+- 
+- #define BOUNDARY_LEFT_SLICE     (1 << 0)
+- #define BOUNDARY_LEFT_TILE      (1 << 1)
+-@@ -812,12 +820,6 @@ typedef struct HEVCLocalContext {
+-     int boundary_flags;
+- } HEVCLocalContext;
+- 
+--#ifdef RPI_WORKER
+--typedef struct HEVCLocalContextIntra {
+--    TransformUnit tu;
+--    NeighbourAvailable na;
+--} HEVCLocalContextIntra;
+--#endif
+- 
+- #ifdef RPI
+- 
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 344e021..325b60e 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -72,7 +72,7 @@ do {                                  \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+- #ifdef RPI_WORKER
+--    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-+    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+- #else
+-     HEVCLocalContext *lc = s->HEVClc;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 5b3eee9be88a5326df7621de95095def969e05a8 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 4 Jun 2015 11:52:55 +0100
+-Subject: [PATCH 58/68] Draft support for tiles
+-
+----
+- libavcodec/hevc.c              | 140 +++++++++++++++++++++++------------------
+- libavcodec/hevc.h              |  21 +++++--
+- libavcodec/hevc_filter.c       |   2 +-
+- libavcodec/hevcpred_template.c |   2 +-
+- 4 files changed, 99 insertions(+), 66 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 73d7f74..ec67252 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -63,10 +63,10 @@
+- 
+-   static void rpi_execute_dblk_cmds(HEVCContext *s);
+-   static void rpi_execute_transform(HEVCContext *s);
+--  static void rpi_execute_inter_qpu(HEVCContext *s);
+-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
+-   static void rpi_execute_pred_cmds(HEVCContext *s);
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+--  static void rpi_inter_clear(HEVCContext *s);
+-+  static void rpi_begin(HEVCContext *s);
+- 
+-   // Define INTER_PASS0 to do inter prediction in first pass
+-   //#define INTER_PASS0
+-@@ -90,16 +90,18 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+- 
+- #ifdef RPI_INTER_QPU
+- 
+-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+-+// For each block of 64*64 the smallest block size is 8x4
+-+// We also need an extra command for the setup information
+-+
+- #define RPI_CHROMA_COMMAND_WORDS 12
+--#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
+-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+- // The QPU code for UV blocks only works up to a block width of 8
+- #define RPI_CHROMA_BLOCK_WIDTH 8
+- 
+--// Split image of 2048 into parts 64 wide
+--// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
+--// For each block of 64*64 the smallest block size is 8x4
+- #define RPI_LUMA_COMMAND_WORDS 9
+--#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+- 
+- #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+- 
+-@@ -216,7 +218,7 @@ static void *worker_start(void *arg)
+-     LOG_ENTER
+-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+- #ifndef LAUNCH_PASS0
+--    rpi_execute_inter_qpu(s);
+-+    rpi_launch_vpu_qpu(s);
+- #endif
+- #ifndef INTER_PASS0
+-     // Perform inter prediction
+-@@ -322,9 +324,14 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+- 
+- #ifdef RPI
+-     av_assert0(sps);
+--    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+--    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
+-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+-     int job;
+-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
+-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+-     for(job=0;job<RPI_MAX_JOBS;job++) {
+-       printf("Allocated %d\n",coefs_per_row);
+-       for(job=0;job<RPI_MAX_JOBS;job++) {
+-@@ -2186,10 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-             int x1 = x0 + (mv->x >> 2);
+-             int y1 = y0 + (mv->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2209,7 +2215,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+-         {
+-@@ -2233,12 +2239,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+--                int chan = x0>>8;
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2262,7 +2266,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -2289,10 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-             int x1 = x0 + (mv->x >> 2);
+-             int y1 = y0 + (mv->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+-             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=16) {
+-                   int bw = nPbW-start_x;
+-@@ -2312,7 +2315,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+- 
+-@@ -2337,12 +2340,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- 
+-                 int x1_c = x0_c + (mv->x >> (2 + hshift));
+-                 int y1_c = y0_c + (mv->y >> (2 + hshift));
+--                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+--                int chan = x0>>8;
+-                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2367,7 +2368,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -2400,8 +2401,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-             int y1 = y0 + (mv->y >> 2);
+-             int x2 = x0 + (mv2->x >> 2);
+-             int y2 = y0 + (mv2->y >> 2);
+--            int chan = x0>>6; // 64 wide blocks per QPU
+--            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
+-+            uint32_t *y = s->curr_y_mvs;
+-             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-                   int bw = nPbW-start_x;
+-@@ -2417,7 +2417,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+--            s->y_mvs[s->pass0_job][chan % 12] = y;
+-+            s->curr_y_mvs = y;
+-         } else
+- #endif
+-         {
+-@@ -2448,9 +2448,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                 int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-                 int y2_c = y0_c + (mv2->y >> (2 + hshift));
+- 
+--                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
+- 
+--                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
+-+                uint32_t *u = s->curr_u_mvs;
+-                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-                       int bw = nPbW_c-start_x;
+-@@ -2479,7 +2478,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+--                s->u_mvs[s->pass0_job][chan & 7] = u;
+-+                s->curr_u_mvs = u;
+-                 return;
+-             }
+- #endif
+-@@ -3114,12 +3113,8 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
+- 
+- static void rpi_do_all_passes(HEVCContext *s)
+- {
+--#ifdef RPI_INTER_QPU
+--    // Kick off inter prediction on QPUs
+--    rpi_execute_inter_qpu(s);
+--#else
+--    rpi_execute_transform(s);
+--#endif
+-+    // Kick off QPUs and VPUs
+-+    rpi_launch_vpu_qpu(s);
+-     // Perform luma inter prediction
+-     rpi_execute_inter_cmds(s);
+-     // Wait for transform completion
+-@@ -3128,18 +3123,18 @@ static void rpi_do_all_passes(HEVCContext *s)
+-     rpi_execute_pred_cmds(s);
+-     // Perform deblocking for CTBs in this row
+-     rpi_execute_dblk_cmds(s);
+--#ifdef RPI_INTER_QPU
+--    rpi_inter_clear(s);
+--#endif
+-+    // Prepare next batch
+-+    rpi_begin(s);
+- }
+- 
+- #endif
+- 
+--#ifdef RPI_INTER_QPU
+--static void rpi_inter_clear(HEVCContext *s)
+-+#ifdef RPI
+-+static void rpi_begin(HEVCContext *s)
+- {
+-     int job = s->pass0_job;
+-     int i;
+-+#ifdef RPI_INTER_QPU
+-     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-     int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
+-@@ -3165,6 +3160,8 @@ static void rpi_inter_clear(HEVCContext *s)
+-         }
+-         *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-     }
+-+    s->curr_u_mvs = s->u_mvs[job][0];
+-+#endif
+- 
+- #ifdef RPI_LUMA_QPU
+-     for(i=0;i<12;i++) {
+-@@ -3187,8 +3184,11 @@ static void rpi_inter_clear(HEVCContext *s)
+-         }
+-         *s->y_mvs[job][i]++ = 0; // Next kernel
+-     }
+-+    s->curr_y_mvs = s->y_mvs[job][0];
+- #endif
+-+    s->ctu_count = 0;
+- }
+-+#endif
+- 
+- #ifdef RPI_SIMULATE_QPUS
+- 
+-@@ -3459,8 +3459,9 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- 
+- #endif
+- 
+-+#ifdef RPI_INTER_QPU
+- 
+--static void rpi_execute_inter_qpu(HEVCContext *s)
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+- {
+-     int k;
+- #ifdef LAUNCH_PASS0
+-@@ -3558,6 +3559,15 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
+- 
+- 
+- }
+-+#else
+-+
+-+#ifdef RPI
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+  rpi_execute_transform(s);
+-+}
+-+#endif
+-+
+- #endif
+- 
+- #ifdef RPI
+-@@ -3617,29 +3627,20 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI
+- #ifdef RPI_INTER_QPU
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+--                    && s->ps.sps->width <= RPI_MAX_WIDTH
+-                     && !s->ps.pps->cross_component_prediction_enabled_flag
+--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
+-                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
+- #else
+-     s->enable_rpi = s->ps.sps->bit_depth == 8
+--                    && s->ps.sps->width <= RPI_MAX_WIDTH
+--                    && !s->ps.pps->cross_component_prediction_enabled_flag
+--                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+- #endif
+- 
+-     if (!s->enable_rpi) {
+-       if (s->ps.pps->cross_component_prediction_enabled_flag)
+-         printf("Cross component\n");
+--      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
+--        printf("Tiles\n");
+--      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
+--        printf("Weighted P slice\n");
+-       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
+-         printf("Weighted B slice\n");
+-     }
+- #endif
+--
+-     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
+- 
+-     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+-@@ -3660,8 +3661,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     s->pass1_job = 0;
+-     s->pass2_job = 0;
+- #endif
+--#ifdef RPI_INTER_QPU
+--    rpi_inter_clear(s);
+-+#ifdef RPI
+-+    rpi_begin(s);
+- #endif
+- 
+-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+-@@ -3679,13 +3680,34 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+-+#ifdef RPI_INTER_QPU
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
+-+#endif
+-+
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- 
+-+#ifdef RPI_INTER_QPU
+-+        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
+-+#endif
+-+
+- #ifdef RPI
+-         if (s->enable_rpi) {
+-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+-+          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+-+          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+-+          //av_assert0(s->pass0_job>=0);
+-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+-           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+--          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
+-+          s->ctu_count++;
+-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+-+
+-+          if ( s->ctu_count >= s->max_ctu_count ) {
+- #ifdef RPI_WORKER
+-             if (s->used_for_ref) {
+-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
+-@@ -3693,7 +3715,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-               rpi_execute_inter_cmds(s);
+-   #endif
+-   #ifdef LAUNCH_PASS0
+--              rpi_execute_inter_qpu(s);
+-+              rpi_launch_vpu_qpu(s);
+-   #endif
+-               // Pass on this job to worker thread
+-               worker_submit_job(s);
+-@@ -3701,9 +3723,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-               worker_pass0_ready(s);
+- 
+-               // Prepare the next batch of commands
+--#ifdef RPI_INTER_QPU
+--              rpi_inter_clear(s);
+--#endif
+-+              rpi_begin(s);
+-             } else {
+-               // Non-ref frame so do it all on this thread
+-               rpi_do_all_passes(s);
+-@@ -3744,7 +3764,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #endif
+- 
+-     // Finish off any half-completed rows
+--    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
+-+    if (s->enable_rpi && s->ctu_count) {
+-         rpi_do_all_passes(s);
+-     }
+- 
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 3cb34bd..a141316 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -823,8 +823,15 @@ typedef struct HEVCLocalContext {
+- 
+- #ifdef RPI
+- 
+-+// The processing is done in chunks
+-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
+-+// This is a distance of 1536 pixels across the screen
+-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+-+// but allocate more memory and increase the latency before data in the next frame can be processed
+-+#define RPI_NUM_CHUNKS 1
+-+
+- // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+--#define RPI_MAX_WIDTH 2048
+-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+- 
+- // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+- #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+-@@ -888,9 +895,6 @@ typedef struct HEVCPredCmd {
+- #endif
+- 
+- typedef struct HEVCContext {
+--#ifdef RPI
+--    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+--#endif
+-     const AVClass *c;  // needed by private avoptions
+-     AVCodecContext *avctx;
+- 
+-@@ -928,6 +932,10 @@ typedef struct HEVCContext {
+-     int pass0_job; // Pass0 does coefficient decode
+-     int pass1_job; // Pass1 does pixel processing
+-     int pass2_job; // Pass2 does reconstruction and deblocking
+-+    int ctu_count; // Number of CTUs done in pass0 so far
+-+    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+-+    int ctu_per_y_chan; // Number of CTUs per luma QPU
+-+    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+- #ifdef RPI_INTER_QPU
+-     GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-     uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-@@ -936,6 +944,7 @@ typedef struct HEVCContext {
+-     uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-     // these pointers are to the next free space
+-     uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+-     // Function pointers
+-     uint32_t mc_filter_uv;
+-     uint32_t mc_filter_uv_b0;
+-@@ -946,6 +955,7 @@ typedef struct HEVCContext {
+-     uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-     uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-     uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
+-     // Function pointers
+-     uint32_t mc_filter;
+-     uint32_t mc_filter_b;
+-@@ -1084,6 +1094,9 @@ typedef struct HEVCContext {
+-     uint32_t max_mastering_luminance;
+-     uint32_t min_mastering_luminance;
+- 
+-+#ifdef RPI
+-+    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
+-+#endif
+- } HEVCContext;
+- 
+- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index b286bbf..1f04790 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -891,7 +891,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+-         if (curr_uv < 0) curr_uv = 0;
+--        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
+-+        if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+-         GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 325b60e..28d2653 100644
+---- a/libavcodec/hevcpred_template.c
+-+++ b/libavcodec/hevcpred_template.c
+-@@ -72,7 +72,7 @@ do {                                  \
+-             else                                                               \
+-                 a = PIXEL_SPLAT_X4(ptr[i + 3])
+- #ifdef RPI_WORKER
+--    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+-+    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+- #else
+-     HEVCLocalContext *lc = s->HEVClc;
+- #endif
+--- 
+-2.7.4
+-
+-
+-From 1674a80d147e5342ef6ea9a4fb4ddfc640c15a05 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 4 Jun 2015 15:48:10 +0100
+-Subject: [PATCH 59/68] Move deblocker into second pass
+-
+----
+- libavcodec/hevc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
+- 1 file changed, 65 insertions(+), 14 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ec67252..6cecbdd 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -67,6 +67,8 @@
+-   static void rpi_execute_pred_cmds(HEVCContext *s);
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+-   static void rpi_begin(HEVCContext *s);
+-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+-   // Define INTER_PASS0 to do inter prediction in first pass
+-   //#define INTER_PASS0
+-@@ -227,6 +229,11 @@ static void *worker_start(void *arg)
+-     // Wait for transform completion
+-     vpu_wait(s->vpu_id);
+- 
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+
+-     worker_complete_middle_job(s);
+-     LOG_EXIT
+-   }
+-@@ -248,10 +255,6 @@ static void *worker_deblock_start(void *arg)
+-       break;
+-     }
+-     LOG_ENTER
+--    // Perform intra prediction and residual reconstruction
+--    rpi_execute_pred_cmds(s);
+--    // Perform deblocking for CTBs in this row
+--    rpi_execute_dblk_cmds(s);
+- 
+-     worker_complete_job(s);
+-     LOG_EXIT
+-@@ -2983,7 +2986,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- static void rpi_execute_dblk_cmds(HEVCContext *s)
+- {
+-     int n;
+--    int job = s->pass2_job;
+-+    int job = s->pass1_job;
+-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
+-     int (*p)[2] = s->dblk_cmds[job];
+-     for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
+-@@ -3021,7 +3024,7 @@ static void rpi_execute_transform(HEVCContext *s)
+- static void rpi_execute_pred_cmds(HEVCContext *s)
+- {
+-   int i;
+--  int job = s->pass2_job;
+-+  int job = s->pass1_job;
+-   HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+- #ifdef RPI_WORKER
+-   HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+-@@ -3506,11 +3509,10 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+- #else
+--    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
+- #endif
+--
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-@@ -3613,6 +3615,60 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[2]);
+-+    iocache.s[1].handle = p->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].size  = sz;
+-+    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].size  = sz;
+-+
+-+    iocache.s[3].handle = p0->vcsm_handle;
+-+    iocache.s[3].cmd = 3; // clean+invalidate
+-+    iocache.s[3].addr = (int) p0->arm;
+-+    iocache.s[3].size  = p0->numbytes;
+-+    if (p1) {
+-+      iocache.s[4].handle = p1->vcsm_handle;
+-+      iocache.s[4].cmd = 3; // clean+invalidate
+-+      iocache.s[4].addr = (int) p1->arm;
+-+      iocache.s[4].size  = p1->numbytes;
+-+    }
+-+    if (p2) {
+-+      iocache.s[5].handle = p2->vcsm_handle;
+-+      iocache.s[5].cmd = 3; // clean+invalidate
+-+      iocache.s[5].addr = (int) p2->arm;
+-+      iocache.s[5].size  = p2->numbytes;
+-+    }
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+    gpu_cache_flush3(p0, p1, p2);
+-+#endif
+-+}
+-+
+- #endif
+- 
+- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-@@ -4127,11 +4183,6 @@ static int hevc_frame_start(HEVCContext *s)
+-     if (!s->avctx->hwaccel)
+-         ff_thread_finish_setup(s->avctx);
+- 
+--#ifdef RPI_INTER_QPU
+--    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
+--    flush_frame(s,s->frame);
+--#endif
+--
+-     return 0;
+- 
+- fail:
+--- 
+-2.7.4
+-
+-
+-From a453fe438c4ab311d6476955d0a40a5d2ed8a1c6 Mon Sep 17 00:00:00 2001
+-From: popcornmix <popcornmix@gmail.com>
+-Date: Thu, 4 Jun 2015 16:10:23 +0100
+-Subject: [PATCH 60/68] Change order of ctu accesses to improve qpu performance
+-
+----
+- libavcodec/hevc.c | 8 ++++----
+- 1 file changed, 4 insertions(+), 4 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 6cecbdd..ec17e64 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -3737,19 +3737,19 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+- 
+- #ifdef RPI_INTER_QPU
+--        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+- #endif
+- #ifdef RPI_LUMA_QPU
+--        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+- #endif
+- 
+-         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+- 
+- #ifdef RPI_INTER_QPU
+--        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
+-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+- #endif
+- #ifdef RPI_LUMA_QPU
+--        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
+-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+- #endif
+- 
+- #ifdef RPI
+--- 
+-2.7.4
+-
+-
+-From 504de0435e8f660c1b7b2d6ec053dc922a2d2896 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 09:36:59 +0100
+-Subject: [PATCH 61/68] Removed deblocker thread
+-
+----
+- libavcodec/hevc.c | 77 +++----------------------------------------------------
+- libavcodec/hevc.h |  4 ---
+- 2 files changed, 4 insertions(+), 77 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index ec17e64..1868532 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -70,11 +70,6 @@
+-   static void flush_frame(HEVCContext *s,AVFrame *frame);
+-   static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+--  // Define INTER_PASS0 to do inter prediction in first pass
+--  //#define INTER_PASS0
+--  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
+--  //#define LAUNCH_PASS0
+--
+- #endif
+- 
+- // #define DISABLE_MC
+-@@ -147,24 +142,12 @@ static void worker_submit_job(HEVCContext *s)
+- }
+- 
+- // Call this to say we have completed pass1
+--static void worker_complete_middle_job(HEVCContext *s)
+--{
+--  LOG_ENTER
+--  pthread_mutex_lock(&s->worker_mutex);
+--  s->worker_middle++;
+--  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+--  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
+--  pthread_mutex_unlock(&s->worker_mutex);
+--  LOG_EXIT
+--}
+--
+--// Call this to say we have completed pass2
+- static void worker_complete_job(HEVCContext *s)
+- {
+-   LOG_ENTER
+-   pthread_mutex_lock(&s->worker_mutex);
+-   s->worker_head++;
+--  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-+  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
+-   pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
+-   pthread_mutex_unlock(&s->worker_mutex);
+-   LOG_EXIT
+-@@ -208,7 +191,7 @@ static void *worker_start(void *arg)
+-   while(1) {
+-     pthread_mutex_lock(&s->worker_mutex);
+- 
+--    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
+-+    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
+-     {
+-       pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
+-     }
+-@@ -219,13 +202,9 @@ static void *worker_start(void *arg)
+-     }
+-     LOG_ENTER
+-     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+--#ifndef LAUNCH_PASS0
+-     rpi_launch_vpu_qpu(s);
+--#endif
+--#ifndef INTER_PASS0
+-     // Perform inter prediction
+-     rpi_execute_inter_cmds(s);
+--#endif
+-     // Wait for transform completion
+-     vpu_wait(s->vpu_id);
+- 
+-@@ -234,28 +213,6 @@ static void *worker_start(void *arg)
+-     // Perform deblocking for CTBs in this row
+-     rpi_execute_dblk_cmds(s);
+- 
+--    worker_complete_middle_job(s);
+--    LOG_EXIT
+--  }
+--  return NULL;
+--}
+--
+--static void *worker_deblock_start(void *arg)
+--{
+--  HEVCContext *s = (HEVCContext *)arg;
+--  while(1) {
+--    pthread_mutex_lock(&s->worker_mutex);
+--    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
+--    {
+--      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
+--    }
+--    pthread_mutex_unlock(&s->worker_mutex);
+--
+--    if (s->kill_worker) {
+--      break;
+--    }
+--    LOG_ENTER
+--
+-     worker_complete_job(s);
+-     LOG_EXIT
+-   }
+-@@ -2998,11 +2955,7 @@ static void rpi_execute_dblk_cmds(HEVCContext *s)
+- static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+--#ifdef LAUNCH_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     //int j;
+-     //int16_t *coeffs = s->coeffs_buf_arm[i];
+-     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-@@ -3057,11 +3010,7 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
+- 
+- static void rpi_execute_inter_cmds(HEVCContext *s)
+- {
+--#ifdef INTER_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-     int n,cidx;
+-     AVFrame myref;
+-@@ -3467,11 +3416,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
+- static void rpi_launch_vpu_qpu(HEVCContext *s)
+- {
+-     int k;
+--#ifdef LAUNCH_PASS0
+--    int job = s->pass0_job;
+--#else
+-     int job = s->pass1_job;
+--#endif
+-     int i;
+-     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+- #ifdef RPI_LUMA_QPU
+-@@ -3574,10 +3519,12 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI
+- 
+-+#ifndef RPI_FAST_CACHEFLUSH
+- static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+- }
+-+#endif
+- 
+- static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+-@@ -3715,7 +3662,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_WORKER
+-     s->pass0_job = 0;
+-     s->pass1_job = 0;
+--    s->pass2_job = 0;
+- #endif
+- #ifdef RPI
+-     rpi_begin(s);
+-@@ -3767,12 +3713,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+- #ifdef RPI_WORKER
+-             if (s->used_for_ref) {
+-               // Split work load onto separate threads so we make as rapid progress as possible with this frame
+--  #ifdef INTER_PASS0
+--              rpi_execute_inter_cmds(s);
+--  #endif
+--  #ifdef LAUNCH_PASS0
+--              rpi_launch_vpu_qpu(s);
+--  #endif
+-               // Pass on this job to worker thread
+-               worker_submit_job(s);
+-               // Make sure we have space to prepare the next job
+-@@ -3814,8 +3754,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+-     // Wait for the worker to finish all its jobs
+-     if (s->enable_rpi) {
+-         worker_wait(s);
+--        av_assert0(s->pass0_job==s->pass1_job);
+--        av_assert0(s->pass1_job==s->pass2_job);
+-     }
+- #endif
+- 
+-@@ -4565,16 +4503,13 @@ static av_cold void hevc_init_worker(HEVCContext *s)
+- {
+-     int err;
+-     pthread_cond_init(&s->worker_cond_head, NULL);
+--    pthread_cond_init(&s->worker_cond_middle, NULL);
+-     pthread_cond_init(&s->worker_cond_tail, NULL);
+-     pthread_mutex_init(&s->worker_mutex, NULL);
+- 
+-     s->worker_tail=0;
+--    s->worker_middle=0;
+-     s->worker_head=0;
+-     s->kill_worker=0;
+-     err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+--    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
+-     if (err) {
+-         printf("Failed to create worker thread\n");
+-         exit(-1);
+-@@ -4586,17 +4521,13 @@ static av_cold void hevc_exit_worker(HEVCContext *s)
+-     void *res;
+-     s->kill_worker=1;
+-     pthread_cond_broadcast(&s->worker_cond_tail);
+--    pthread_cond_broadcast(&s->worker_cond_middle);
+-     pthread_join(s->worker_thread, &res);
+--    pthread_join(s->worker_deblock_thread, &res);
+- 
+-     pthread_cond_destroy(&s->worker_cond_head);
+--    pthread_cond_destroy(&s->worker_cond_middle);
+-     pthread_cond_destroy(&s->worker_cond_tail);
+-     pthread_mutex_destroy(&s->worker_mutex);
+- 
+-     s->worker_tail=0;
+--    s->worker_middle=0;
+-     s->worker_head=0;
+-     s->kill_worker=0;
+- }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index a141316..ef5bfb1 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -931,7 +931,6 @@ typedef struct HEVCContext {
+-     //GPU_MEM_PTR_T dummy;
+-     int pass0_job; // Pass0 does coefficient decode
+-     int pass1_job; // Pass1 does pixel processing
+--    int pass2_job; // Pass2 does reconstruction and deblocking
+-     int ctu_count; // Number of CTUs done in pass0 so far
+-     int max_ctu_count; // Number of CTUs when we trigger a round of processing
+-     int ctu_per_y_chan; // Number of CTUs per luma QPU
+-@@ -963,15 +962,12 @@ typedef struct HEVCContext {
+- 
+- #ifdef RPI_WORKER
+-     pthread_t worker_thread;
+--    pthread_t worker_deblock_thread;
+-     pthread_cond_t worker_cond_head;
+-     pthread_cond_t worker_cond_tail;
+--    pthread_cond_t worker_cond_middle;
+-     pthread_mutex_t worker_mutex;
+- 
+-     int worker_tail; // Contains the number of posted jobs
+-     int worker_head; // Contains the number of completed jobs
+--    int worker_middle; // Contains the number of completed jobs
+-     int kill_worker; // set to 1 to terminate the worker
+- #endif
+- 
+--- 
+-2.7.4
+-
+-
+-From 74892301cdb0829de959b798debac6ffe1c71603 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 11:04:43 +0100
+-Subject: [PATCH 62/68] Reduced amount of output frame that is invalidated
+-
+----
+- libavcodec/hevc.c | 45 +++++++++++++++++++++++++++++----------------
+- 1 file changed, 29 insertions(+), 16 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 1868532..cbb4f46 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -68,7 +68,7 @@
+-   static void rpi_execute_inter_cmds(HEVCContext *s);
+-   static void rpi_begin(HEVCContext *s);
+-   static void flush_frame(HEVCContext *s,AVFrame *frame);
+--  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+- 
+- #endif
+- 
+-@@ -3454,9 +3454,9 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- 
+- #ifdef RPI_MULTI_MAILBOX
+- #ifdef RPI_CACHE_UNIF_MVS
+--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+- #else
+--    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+- #endif
+-     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-@@ -3530,6 +3530,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-     int curr_uv = 0;
+-@@ -3537,22 +3538,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+-     int sz,base;
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].addr = (int)(p->arm) + base;
+-     iocache.s[0].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[2]);
+-     iocache.s[1].handle = p->vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].addr = (int)(p->arm) + base;
+-     iocache.s[1].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[0]);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+-     iocache.s[2].handle = p->vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].addr = (int)(p->arm) + base;
+-     iocache.s[2].size  = sz;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -3562,33 +3562,46 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- #endif
+- }
+- 
+--static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+--    int n = s->ps.sps->height;
+--    int curr_y = 0;
+--    int curr_uv = 0;
+--    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int n;
+-+    int curr_y;
+-+    int curr_uv;
+-+    int n_uv;
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     int sz,base;
+-+    int (*d)[2] = s->dblk_cmds[job];
+-+    int low=(*d)[1];
+-+    int high=(*d)[1];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+-+        int y = (*d)[1];
+-+        low=FFMIN(low,y);
+-+        high=FFMAX(high,y);
+-+    }
+-+    curr_y = low;
+-+    n = high+(1 << s->ps.sps->log2_ctb_size);
+-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+    n_uv = n >> s->ps.sps->vshift[1];
+-+
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm + base;
+-+    iocache.s[0].addr = (int)(p->arm) + base;
+-     iocache.s[0].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[2]);
+-     iocache.s[1].handle = p->vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = p->arm + base;
+-+    iocache.s[1].addr = (int)(p->arm) + base;
+-     iocache.s[1].size  = sz;
+-     p = av_buffer_pool_opaque(frame->buf[0]);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+-     iocache.s[2].handle = p->vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = p->arm + base;
+-+    iocache.s[2].addr = (int)(p->arm) + base;
+-     iocache.s[2].size  = sz;
+- 
+-     iocache.s[3].handle = p0->vcsm_handle;
+--- 
+-2.7.4
+-
+-
+-From 090b6be5b501bd3c547700926e540397f0b39e69 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Mon, 8 Jun 2015 11:55:29 +0100
+-Subject: [PATCH 63/68] Packed 16x16 and 32x32 into the same buffer
+-
+----
+- libavcodec/hevc.c       | 24 +++++++++++++++---------
+- libavcodec/hevc_cabac.c |  9 ++++++++-
+- libavcodec/rpi_qpu.c    |  2 +-
+- 3 files changed, 24 insertions(+), 11 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index cbb4f46..a596534 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -299,12 +299,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-         s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-         if (!s->coeffs_buf_arm[job][0])
+-             goto fail;
+--        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
+-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+-         s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-         s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-         if (!s->coeffs_buf_arm[job][2])
+-             goto fail;
+--        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+-         s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-       }
+-     }
+-@@ -2956,15 +2956,20 @@ static void rpi_execute_transform(HEVCContext *s)
+- {
+-     int i=2;
+-     int job = s->pass1_job;
+--    //int j;
+--    //int16_t *coeffs = s->coeffs_buf_arm[i];
+--    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
+--    //    s->hevcdsp.idct[4-2](coeffs, 16);
+--    //}
+-+    /*int j;
+-+    int16_t *coeffs = s->coeffs_buf_arm[job][i];
+-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
+-+        s->hevcdsp.idct[4-2](coeffs, 16);
+-+    }
+-+    i=3;
+-+    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
+-+    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
+-+        s->hevcdsp.idct[5-2](coeffs, 32);
+-+    }*/
+- 
+-     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+-     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+--                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
+-+                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+-                                s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+-     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
+-     //gpu_cache_flush(&s->coeffs_buf_accelerated);
+-@@ -3458,7 +3463,8 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- #else
+-     flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+- #endif
+--    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+-                                    qpu_get_fn(QPU_MC_SETUP_UV),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-                                    (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 6523e66..8656917 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -1051,7 +1051,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+-     if (s->enable_rpi) {
+-         int n = trafo_size * trafo_size;
+-         if (use_vpu) {
+--            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            // We support size 4 and size 5.
+-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+-+            // num_coeffs is indexed by log2_trafo_size-2
+-+            if (log2_trafo_size == 4)
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            else
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+-             s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-         } else {
+-             coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 4480f72..0121fca 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -5,7 +5,7 @@
+- // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+- //#define RPI_TIME_TOTAL_VPU
+- // define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+--//#define RPI_TIME_TOTAL_POSTED
+-+#define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- 
+--- 
+-2.7.4
+-
+-
+-From ed359bbce56817bf9db0e54701103bd0505c353b Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Thu, 25 Jun 2015 09:02:47 +0100
+-Subject: [PATCH 64/68] Moved luma deblock to VPU
+-
+----
+- libavcodec/hevc.c               |   18 +-
+- libavcodec/hevc.h               |   11 +
+- libavcodec/hevc_filter.c        |  120 ++-
+- libavcodec/rpi_hevc_transform.h | 1802 ++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s |  426 +++++++++
+- libavcodec/rpi_qpu.c            |   12 +-
+- libavcodec/rpi_shader.c         |    2 +-
+- 7 files changed, 2378 insertions(+), 13 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index a596534..4ce94a7 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -246,6 +246,12 @@ static void pic_arrays_free(HEVCContext *s)
+-       }
+-     }
+- #endif
+-+#ifdef RPI_DEBLOCK_VPU
+-+    if (s->y_setup_arm) {
+-+      gpu_free(&s->y_setup_ptr);
+-+      s->y_setup_arm = 0;
+-+    }
+-+#endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+- 
+-@@ -283,12 +289,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+- 
+- #ifdef RPI
+--    av_assert0(sps);
+-     int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-     int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-     int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-     int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+-     int job;
+-+    av_assert0(sps);
+-     s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-     s->ctu_per_y_chan = s->max_ctu_count / 12;
+-     s->ctu_per_uv_chan = s->max_ctu_count / 8;
+-@@ -309,6 +315,16 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-       }
+-     }
+- #endif
+-+#ifdef RPI_DEBLOCK_VPU
+-+    s->enable_rpi_deblock = !sps->sao_enabled;
+-+    s->setup_width = (sps->width+15) / 16;
+-+    s->setup_height = (sps->height+15) / 16;
+-+    gpu_malloc_uncached(sizeof(*s->y_setup_arm) * s->setup_width * s->setup_height, &s->y_setup_ptr); // TODO make this cached
+-+    s->y_setup_arm = (void*)s->y_setup_ptr.arm;
+-+    s->y_setup_vc = (void*)s->y_setup_ptr.vc;
+-+    memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
+-+    printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
+-+#endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-     s->bs_height = (height >> 2) + 1;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index ef5bfb1..cf08489 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -57,6 +57,8 @@
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+- 
+-+  #define RPI_DEBLOCK_VPU
+-+
+- #endif
+- 
+- #define MAX_DPB_SIZE 16 // A.4.1
+-@@ -971,6 +973,15 @@ typedef struct HEVCContext {
+-     int kill_worker; // set to 1 to terminate the worker
+- #endif
+- 
+-+#ifdef RPI_DEBLOCK_VPU
+-+    int enable_rpi_deblock;
+-+    GPU_MEM_PTR_T y_setup_ptr;
+-+    uint8_t (*y_setup_arm)[2][2][2][4];
+-+    uint8_t (*y_setup_vc)[2][2][2][4];
+-+    int setup_width; // Number of 16x16 blocks across the image
+-+    int setup_height; // Number of 16x16 blocks down the image
+-+#endif
+-+
+- #endif
+- 
+-     uint8_t *cabac_state;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f04790..06371da 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -564,6 +564,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((y>>3) & 1) << 1;
+-+                    int b = (x>>3) & 1;
+-+                    setup = s->y_setup_arm[num16];
+-+                    setup[0][b][0][a] = beta;
+-+                    setup[0][b][0][a + 1] = beta;
+-+                    setup[0][b][1][a] = tc[0];
+-+                    setup[0][b][1][a + 1] = tc[1];
+-+                } else
+-+#endif
+-                     s->hevcdsp.hevc_v_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -596,6 +609,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((x>>3) & 1) << 1;
+-+                    int b = (y>>3) & 1;
+-+                    setup = s->y_setup_arm[num16];
+-+                    setup[1][b][0][a] = beta;
+-+                    setup[1][b][0][a + 1] = beta;
+-+                    setup[1][b][1][a] = tc[0];
+-+                    setup[1][b][1][a + 1] = tc[1];
+-+                } else
+-+#endif
+-                     s->hevcdsp.hevc_h_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -876,33 +902,85 @@ static void flush_buffer(AVBufferRef *bref) {
+- }
+- 
+- // Return Physical address for this image
+--static int ff_hevc_buf_base(AVBufferRef *bref) {
+-+static uint32_t get_vc_address(AVBufferRef *bref) {
+-   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc & 0x3fffffff;
+-+  return p->vc;
+- }
+- 
+-+// ff_hevc_flush_buffer_lines
+-+// flushes and invalidates all pixel rows in [start,end-1]
+-+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = start;
+-+        int n = end;
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T *p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        if (flush_chroma) {
+-+          p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+          iocache.s[0].handle = p->vcsm_handle;
+-+          iocache.s[0].cmd = 3; // clean+invalidate
+-+          iocache.s[0].addr = (int)p->arm + base;
+-+          iocache.s[0].size  = sz;
+-+          p = av_buffer_pool_opaque(s->frame->buf[2]);
+-+          iocache.s[1].handle = p->vcsm_handle;
+-+          iocache.s[1].cmd = 3; // clean+invalidate
+-+          iocache.s[1].addr = (int)p->arm + base;
+-+          iocache.s[1].size  = sz;
+-+        }
+-+        if (flush_luma) {
+-+          p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+          sz = s->frame->linesize[0] * (n-curr_y);
+-+          base = s->frame->linesize[0] * curr_y;
+-+          iocache.s[2].handle = p->vcsm_handle;
+-+          iocache.s[2].cmd = 3; // clean+invalidate
+-+          iocache.s[2].addr = (int)p->arm + base;
+-+          iocache.s[2].size  = sz;
+-+        }
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        if (flush_chroma) {
+-+          flush_buffer(s->frame->buf[1]);
+-+          flush_buffer(s->frame->buf[2]);
+-+        }
+-+        if (flush_luma) {
+-+          flush_buffer(s->frame->buf[0]);
+-+        }
+-+#endif
+-+}
+-+
+-+
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-     if (s->enable_rpi && s->used_for_ref) {
+-+      // TODO make this use ff_hevc_flush_buffer_lines
+- #ifdef RPI_FAST_CACHEFLUSH
+-         struct vcsm_user_clean_invalid_s iocache = {};
+-         int curr_y = ((int *)f->progress->data)[0];
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+-+        GPU_MEM_PTR_T *p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+--        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
+-+        p = av_buffer_pool_opaque(s->frame->buf[1]);
+-         iocache.s[0].handle = p->vcsm_handle;
+-         iocache.s[0].cmd = 3; // clean+invalidate
+--        iocache.s[0].addr = p->arm + base;
+-+        iocache.s[0].addr = (int)p->arm + base;
+-         iocache.s[0].size  = sz;
+-         p = av_buffer_pool_opaque(s->frame->buf[2]);
+-         iocache.s[1].handle = p->vcsm_handle;
+-         iocache.s[1].cmd = 3; // clean+invalidate
+--        iocache.s[1].addr = p->arm + base;
+-+        iocache.s[1].addr = (int)p->arm + base;
+-         iocache.s[1].size  = sz;
+- 
+- #ifdef RPI_LUMA_QPU
+-@@ -911,7 +989,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         base = s->frame->linesize[0] * curr_y;
+-         iocache.s[2].handle = p->vcsm_handle;
+-         iocache.s[2].cmd = 3; // clean+invalidate
+--        iocache.s[2].addr = p->arm + base;
+-+        iocache.s[2].addr = (int)p->arm + base;
+-         iocache.s[2].size  = sz;
+- #endif
+-         vcsm_clean_invalid( &iocache );
+-@@ -930,11 +1008,40 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- }
+- #endif
+- 
+-+#ifdef RPI_DEBLOCK_VPU
+-+/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+-+static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+-+{
+-+  // Flush image, 4 lines above to bottom of ctb stripe
+-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
+-+  // TODO flush buffer of beta/tc setup when it becomes cached
+-+  // Call VPU
+-+  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
+-+  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
+-+                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
+-+                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
+-+}
+-+
+-+static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
+-+{
+-+   int y2;
+-+   for(y2=y;y2<y+ctb_size;y2+=16) {
+-+      rpi_deblock(s,y2,16);
+-+   }
+-+}
+-+#endif
+-+
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
+-         deblocking_filter_CTB(s, x, y);
+-+#ifdef RPI_DEBLOCK_VPU
+-+    if (s->enable_rpi_deblock && x_end)
+-+    {
+-+      rpi_deblock(s, y, ctb_size);
+-+    }
+-+#endif
+-     if (s->ps.sps->sao_enabled) {
+-         int y_end = y >= s->ps.sps->height - ctb_size;
+-         if (y && x)
+-@@ -965,6 +1072,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //if (((y + ctb_size)&63)==0)
+- #ifdef RPI_INTER_QPU
+-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+-+        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-     }
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index 4f13622..b3f155f 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,7 +3,13 @@ unsigned char rpi_hevc_transform [] = {
+- 106,
+- 0,
+- 144,
+--35,
+-+38,
+-+1,
+-+37,
+-+106,
+-+0,
+-+144,
+-+57,
+- 1,
+- 169,
+- 3,
+-@@ -627,4 +633,1798 @@ unsigned char rpi_hevc_transform [] = {
+- 30,
+- 90,
+- 0,
+-+169,
+-+3,
+-+73,
+-+64,
+-+52,
+-+64,
+-+45,
+-+64,
+-+2,
+-+64,
+-+10,
+-+64,
+-+64,
+-+198,
+-+1,
+-+7,
+-+8,
+-+232,
+-+63,
+-+0,
+-+0,
+-+0,
+-+6,
+-+232,
+-+253,
+-+255,
+-+255,
+-+255,
+-+0,
+-+246,
+-+0,
+-+0,
+-+0,
+-+4,
+-+215,
+-+64,
+-+3,
+-+96,
+-+2,
+-+248,
+-+0,
+-+35,
+-+0,
+-+0,
+-+64,
+-+56,
+-+0,
+-+0,
+-+4,
+-+248,
+-+0,
+-+36,
+-+0,
+-+0,
+-+64,
+-+56,
+-+8,
+-+0,
+-+0,
+-+240,
+-+64,
+-+0,
+-+132,
+-+3,
+-+128,
+-+240,
+-+0,
+-+0,
+-+132,
+-+3,
+-+128,
+-+144,
+-+137,
+-+0,
+-+131,
+-+98,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+129,
+-+0,
+-+131,
+-+102,
+-+0,
+-+158,
+-+67,
+-+0,
+-+2,
+-+248,
+-+0,
+-+35,
+-+0,
+-+0,
+-+64,
+-+56,
+-+0,
+-+0,
+-+4,
+-+248,
+-+0,
+-+36,
+-+0,
+-+0,
+-+64,
+-+56,
+-+8,
+-+0,
+-+0,
+-+240,
+-+64,
+-+0,
+-+132,
+-+3,
+-+128,
+-+240,
+-+0,
+-+0,
+-+132,
+-+3,
+-+128,
+-+144,
+-+108,
+-+0,
+-+131,
+-+98,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+100,
+-+0,
+-+131,
+-+102,
+-+0,
+-+248,
+-+64,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+248,
+-+0,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+144,
+-+161,
+-+0,
+-+188,
+-+64,
+-+67,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+150,
+-+0,
+-+195,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+12,
+-+128,
+-+7,
+-+192,
+-+130,
+-+248,
+-+0,
+-+0,
+-+112,
+-+192,
+-+224,
+-+16,
+-+195,
+-+31,
+-+132,
+-+248,
+-+1,
+-+0,
+-+112,
+-+0,
+-+224,
+-+16,
+-+203,
+-+31,
+-+3,
+-+99,
+-+131,
+-+71,
+-+68,
+-+232,
+-+32,
+-+0,
+-+0,
+-+0,
+-+0,
+-+99,
+-+2,
+-+99,
+-+23,
+-+102,
+-+7,
+-+106,
+-+127,
+-+156,
+-+182,
+-+255,
+-+0,
+-+248,
+-+64,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+248,
+-+0,
+-+0,
+-+112,
+-+0,
+-+192,
+-+243,
+-+211,
+-+31,
+-+128,
+-+144,
+-+112,
+-+0,
+-+188,
+-+64,
+-+67,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+0,
+-+255,
+-+64,
+-+0,
+-+0,
+-+20,
+-+200,
+-+243,
+-+0,
+-+0,
+-+128,
+-+144,
+-+101,
+-+0,
+-+195,
+-+232,
+-+0,
+-+2,
+-+0,
+-+0,
+-+12,
+-+128,
+-+7,
+-+192,
+-+130,
+-+248,
+-+0,
+-+0,
+-+112,
+-+192,
+-+224,
+-+16,
+-+195,
+-+31,
+-+132,
+-+248,
+-+1,
+-+0,
+-+112,
+-+0,
+-+224,
+-+16,
+-+203,
+-+31,
+-+25,
+-+102,
+-+9,
+-+106,
+-+2,
+-+30,
+-+41,
+-+3,
+-+26,
+-+87,
+-+162,
+-+64,
+-+64,
+-+198,
+-+1,
+-+23,
+-+127,
+-+158,
+-+103,
+-+255,
+-+239,
+-+3,
+-+0,
+-+254,
+-+0,
+-+143,
+-+92,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+143,
+-+93,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+143,
+-+94,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+143,
+-+95,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+142,
+-+208,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+142,
+-+209,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+142,
+-+210,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+0,
+-+142,
+-+211,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+128,
+-+144,
+-+107,
+-+0,
+-+8,
+-+255,
+-+99,
+-+23,
+-+0,
+-+212,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+23,
+-+0,
+-+228,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+227,
+-+23,
+-+0,
+-+244,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+35,
+-+52,
+-+0,
+-+180,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+99,
+-+52,
+-+0,
+-+164,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+52,
+-+0,
+-+148,
+-+192,
+-+51,
+-+0,
+-+0,
+-+111,
+-+3,
+-+239,
+-+3,
+-+0,
+-+254,
+-+0,
+-+143,
+-+12,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+143,
+-+13,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+143,
+-+14,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+143,
+-+15,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+192,
+-+142,
+-+16,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+128,
+-+142,
+-+17,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+64,
+-+142,
+-+18,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+0,
+-+254,
+-+0,
+-+142,
+-+19,
+-+0,
+-+0,
+-+240,
+-+12,
+-+0,
+-+128,
+-+144,
+-+33,
+-+0,
+-+8,
+-+255,
+-+99,
+-+3,
+-+0,
+-+212,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+3,
+-+0,
+-+228,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+227,
+-+3,
+-+0,
+-+244,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+35,
+-+4,
+-+0,
+-+180,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+99,
+-+4,
+-+0,
+-+164,
+-+192,
+-+51,
+-+0,
+-+0,
+-+8,
+-+255,
+-+163,
+-+4,
+-+0,
+-+148,
+-+192,
+-+51,
+-+0,
+-+0,
+-+111,
+-+3,
+-+32,
+-+246,
+-+192,
+-+11,
+-+1,
+-+16,
+-+32,
+-+246,
+-+2,
+-+137,
+-+47,
+-+240,
+-+40,
+-+246,
+-+2,
+-+140,
+-+47,
+-+240,
+-+128,
+-+245,
+-+99,
+-+140,
+-+5,
+-+4,
+-+0,
+-+247,
+-+99,
+-+140,
+-+1,
+-+20,
+-+88,
+-+246,
+-+99,
+-+140,
+-+1,
+-+20,
+-+0,
+-+247,
+-+35,
+-+136,
+-+62,
+-+226,
+-+32,
+-+247,
+-+35,
+-+136,
+-+32,
+-+210,
+-+0,
+-+247,
+-+34,
+-+136,
+-+63,
+-+2,
+-+208,
+-+246,
+-+34,
+-+136,
+-+0,
+-+4,
+-+0,
+-+247,
+-+99,
+-+136,
+-+58,
+-+162,
+-+32,
+-+247,
+-+99,
+-+136,
+-+33,
+-+146,
+-+0,
+-+247,
+-+98,
+-+136,
+-+59,
+-+18,
+-+208,
+-+246,
+-+98,
+-+136,
+-+0,
+-+20,
+-+0,
+-+247,
+-+162,
+-+136,
+-+33,
+-+2,
+-+88,
+-+246,
+-+98,
+-+137,
+-+2,
+-+68,
+-+88,
+-+246,
+-+162,
+-+137,
+-+3,
+-+68,
+-+208,
+-+254,
+-+227,
+-+136,
+-+60,
+-+242,
+-+192,
+-+243,
+-+188,
+-+11,
+-+208,
+-+254,
+-+227,
+-+136,
+-+56,
+-+178,
+-+192,
+-+243,
+-+188,
+-+10,
+-+32,
+-+255,
+-+226,
+-+136,
+-+38,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+208,
+-+254,
+-+227,
+-+136,
+-+59,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+32,
+-+255,
+-+226,
+-+136,
+-+49,
+-+58,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+226,
+-+136,
+-+34,
+-+34,
+-+192,
+-+243,
+-+60,
+-+128,
+-+32,
+-+255,
+-+226,
+-+136,
+-+37,
+-+58,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+192,
+-+136,
+-+1,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+0,
+-+255,
+-+194,
+-+8,
+-+0,
+-+52,
+-+195,
+-+243,
+-+0,
+-+128,
+-+0,
+-+255,
+-+202,
+-+40,
+-+0,
+-+52,
+-+195,
+-+243,
+-+0,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+35,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+192,
+-+136,
+-+1,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+0,
+-+255,
+-+226,
+-+140,
+-+34,
+-+34,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+255,
+-+227,
+-+140,
+-+36,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+0,
+-+254,
+-+192,
+-+136,
+-+0,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+16,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+16,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+32,
+-+246,
+-+226,
+-+136,
+-+35,
+-+50,
+-+32,
+-+254,
+-+226,
+-+136,
+-+35,
+-+58,
+-+192,
+-+243,
+-+60,
+-+0,
+-+11,
+-+96,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+115,
+-+5,
+-+106,
+-+0,
+-+144,
+-+173,
+-+1,
+-+27,
+-+96,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+147,
+-+5,
+-+106,
+-+0,
+-+144,
+-+227,
+-+0,
+-+64,
+-+246,
+-+163,
+-+140,
+-+1,
+-+4,
+-+0,
+-+246,
+-+192,
+-+175,
+-+63,
+-+2,
+-+0,
+-+246,
+-+192,
+-+174,
+-+59,
+-+2,
+-+0,
+-+246,
+-+128,
+-+175,
+-+62,
+-+2,
+-+0,
+-+246,
+-+128,
+-+174,
+-+58,
+-+2,
+-+0,
+-+246,
+-+64,
+-+175,
+-+61,
+-+2,
+-+0,
+-+246,
+-+64,
+-+174,
+-+57,
+-+2,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+228,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+191,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+235,
+-+143,
+-+52,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+2,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+191,
+-+226,
+-+192,
+-+243,
+-+188,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+2,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+190,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+171,
+-+143,
+-+52,
+-+226,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+180,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+191,
+-+226,
+-+192,
+-+243,
+-+188,
+-+10,
+-+128,
+-+253,
+-+43,
+-+240,
+-+3,
+-+212,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+35,
+-+141,
+-+1,
+-+196,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+189,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+107,
+-+143,
+-+52,
+-+210,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+148,
+-+192,
+-+243,
+-+128,
+-+11,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+180,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+240,
+-+1,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+228,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+187,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+235,
+-+142,
+-+52,
+-+178,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+2,
+-+148,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+187,
+-+162,
+-+192,
+-+243,
+-+188,
+-+10,
+-+64,
+-+254,
+-+43,
+-+141,
+-+0,
+-+244,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+2,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+186,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+171,
+-+142,
+-+52,
+-+162,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+255,
+-+43,
+-+240,
+-+4,
+-+244,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+43,
+-+240,
+-+187,
+-+162,
+-+192,
+-+243,
+-+188,
+-+10,
+-+128,
+-+253,
+-+43,
+-+240,
+-+3,
+-+148,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+254,
+-+35,
+-+141,
+-+1,
+-+132,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+35,
+-+141,
+-+3,
+-+68,
+-+32,
+-+247,
+-+35,
+-+141,
+-+185,
+-+66,
+-+240,
+-+246,
+-+35,
+-+141,
+-+50,
+-+66,
+-+0,
+-+255,
+-+107,
+-+142,
+-+52,
+-+146,
+-+192,
+-+243,
+-+60,
+-+128,
+-+64,
+-+255,
+-+98,
+-+141,
+-+0,
+-+52,
+-+192,
+-+243,
+-+0,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+53,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+1,
+-+4,
+-+0,
+-+240,
+-+64,
+-+147,
+-+5,
+-+106,
+-+0,
+-+144,
+-+177,
+-+0,
+-+88,
+-+246,
+-+163,
+-+140,
+-+1,
+-+4,
+-+128,
+-+245,
+-+99,
+-+141,
+-+10,
+-+4,
+-+88,
+-+246,
+-+162,
+-+138,
+-+1,
+-+68,
+-+0,
+-+247,
+-+162,
+-+138,
+-+36,
+-+162,
+-+88,
+-+254,
+-+162,
+-+138,
+-+3,
+-+164,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+255,
+-+226,
+-+137,
+-+32,
+-+2,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+247,
+-+226,
+-+137,
+-+42,
+-+114,
+-+0,
+-+255,
+-+34,
+-+138,
+-+33,
+-+18,
+-+195,
+-+243,
+-+60,
+-+0,
+-+32,
+-+247,
+-+34,
+-+138,
+-+42,
+-+130,
+-+16,
+-+246,
+-+98,
+-+138,
+-+40,
+-+114,
+-+16,
+-+246,
+-+98,
+-+138,
+-+41,
+-+146,
+-+32,
+-+246,
+-+98,
+-+138,
+-+41,
+-+146,
+-+32,
+-+246,
+-+226,
+-+137,
+-+41,
+-+146,
+-+40,
+-+246,
+-+34,
+-+138,
+-+41,
+-+146,
+-+32,
+-+247,
+-+163,
+-+141,
+-+63,
+-+178,
+-+32,
+-+247,
+-+227,
+-+141,
+-+62,
+-+162,
+-+0,
+-+254,
+-+0,
+-+240,
+-+8,
+-+4,
+-+0,
+-+240,
+-+128,
+-+11,
+-+128,
+-+253,
+-+35,
+-+240,
+-+9,
+-+100,
+-+192,
+-+243,
+-+128,
+-+10,
+-+128,
+-+253,
+-+163,
+-+141,
+-+128,
+-+115,
+-+192,
+-+243,
+-+152,
+-+10,
+-+88,
+-+246,
+-+163,
+-+141,
+-+4,
+-+100,
+-+208,
+-+246,
+-+35,
+-+139,
+-+0,
+-+100,
+-+32,
+-+255,
+-+34,
+-+139,
+-+53,
+-+202,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+139,
+-+0,
+-+4,
+-+0,
+-+240,
+-+0,
+-+160,
+-+240,
+-+246,
+-+163,
+-+141,
+-+48,
+-+98,
+-+0,
+-+247,
+-+99,
+-+139,
+-+63,
+-+210,
+-+0,
+-+247,
+-+98,
+-+139,
+-+1,
+-+212,
+-+88,
+-+254,
+-+98,
+-+139,
+-+1,
+-+212,
+-+192,
+-+243,
+-+128,
+-+11,
+-+32,
+-+255,
+-+99,
+-+139,
+-+62,
+-+98,
+-+192,
+-+243,
+-+188,
+-+10,
+-+88,
+-+246,
+-+98,
+-+139,
+-+1,
+-+212,
+-+240,
+-+246,
+-+98,
+-+139,
+-+50,
+-+210,
+-+0,
+-+247,
+-+163,
+-+128,
+-+59,
+-+146,
+-+0,
+-+247,
+-+160,
+-+128,
+-+1,
+-+36,
+-+88,
+-+254,
+-+160,
+-+128,
+-+1,
+-+36,
+-+192,
+-+243,
+-+128,
+-+11,
+-+0,
+-+247,
+-+163,
+-+128,
+-+58,
+-+98,
+-+64,
+-+255,
+-+35,
+-+240,
+-+0,
+-+100,
+-+192,
+-+243,
+-+128,
+-+10,
+-+64,
+-+255,
+-+163,
+-+128,
+-+0,
+-+164,
+-+192,
+-+243,
+-+128,
+-+10,
+-+88,
+-+246,
+-+160,
+-+128,
+-+1,
+-+36,
+-+240,
+-+246,
+-+160,
+-+128,
+-+50,
+-+34,
+-+8,
+-+255,
+-+227,
+-+143,
+-+54,
+-+242,
+-+192,
+-+243,
+-+60,
+-+128,
+-+40,
+-+255,
+-+227,
+-+142,
+-+54,
+-+178,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+39,
+-+10,
+-+0,
+-+240,
+-+60,
+-+128,
+-+8,
+-+255,
+-+163,
+-+143,
+-+45,
+-+226,
+-+192,
+-+243,
+-+60,
+-+128,
+-+0,
+-+254,
+-+0,
+-+240,
+-+44,
+-+10,
+-+0,
+-+240,
+-+60,
+-+0,
+-+0,
+-+254,
+-+0,
+-+240,
+-+40,
+-+10,
+-+0,
+-+240,
+-+60,
+-+128,
+-+8,
+-+255,
+-+163,
+-+142,
+-+2,
+-+162,
+-+192,
+-+243,
+-+60,
+-+128,
+-+90,
+-+0,
+- };
+-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index fd159bc..b055208 100644
+---- a/libavcodec/rpi_hevc_transform.s
+-+++ b/libavcodec/rpi_hevc_transform.s
+-@@ -83,6 +83,8 @@
+- hevc_trans_16x16:
+-   cmp r5,1
+-   beq memclear16
+-+  cmp r5,2
+-+  beq hevc_deblock_16x16
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -282,3 +284,427 @@ loop:
+-   cmp r1,0
+-   bgt loop
+-   b lr
+-+
+-+
+-+################################################################################
+-+# HEVC VPU Deblock
+-+#
+-+# Vertical edges before horizontal
+-+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+-+#
+-+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+-+# The VPU code works in units of 16x16 blocks.
+-+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+-+# One final horizontal filter is required at the end.
+-+# PCM is not allowed in this code.
+-+#
+-+#
+-+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+-+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
+-+
+-+.set P0,63
+-+.set P1,62
+-+.set P2,61
+-+.set P3,60
+-+.set Q0,59
+-+.set Q1,58
+-+.set Q2,57
+-+.set Q3,56
+-+
+-+.set dp,32
+-+.set dq,33
+-+.set d,34
+-+.set decision,35
+-+.set beta,36
+-+.set beta2,37
+-+.set beta3,38
+-+.set ptest,39
+-+.set qtest,40
+-+.set pqtest,41
+-+.set thresh,42
+-+.set deltatest, 44
+-+.set deltap1, 45
+-+.set tc25, 46
+-+.set setup,47
+-+.set tc,48
+-+.set tc25,49
+-+.set tc2, 50
+-+.set do_filter, 51
+-+.set delta, 52
+-+.set tc10, 53
+-+.set delta0, 54
+-+.set delta1, 55
+-+.set zeros, 0
+-+.set setup_input, 1
+-+.set deltaq1, 2
+-+
+-+
+-+
+-+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+-+# Row has num16 16x16 blocks across
+-+# Beta goes from 0 to 64
+-+# tc goes from 0 to 24
+-+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+-+#   has 8 bytes per edge
+-+#   has 16 bytes per direction
+-+#   has 32 bytes per 16x16 block
+-+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
+-+hevc_deblock_16x16:
+-+  push r6-r15, lr
+-+  mov r9,r4
+-+  mov r4,r3
+-+  mov r13,r2
+-+  mov r2,r0
+-+  mov r10,r0
+-+  subscale4 r0,r1
+-+  mov r8,63
+-+  mov r6,-3
+-+  vmov H(zeros,0),0
+-+# r7 is number of blocks still to load
+-+# r0 is location of current block - 4 * stride
+-+# r1 is stride
+-+# r2 is location of current block
+-+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+-+# r4 is setup
+-+# r5 is for temporary calculations
+-+# r8 holds 63
+-+# r6 holds -3
+-+# r9 holds the number of 16 high rows to process
+-+# r10 holds the original img base
+-+# r11 returns 0 if no filtering was done on the edge
+-+# r12 saves a copy of this
+-+# r13 is copy of width
+-+
+-+process_row:
+-+  # First iteration does not do horizontal filtering on previous
+-+  mov r7, r13
+-+  mov r3,0
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+-+  vstb H(zeros,0),(r4)
+-+  bl vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+-+  bl vert_filter
+-+  sub r3,8
+-+  b start_deblock_loop
+-+deblock_loop:
+-+  # Middle iterations do vertical on current block and horizontal on preceding
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)
+-+  vstb H(zeros,0),(r4)
+-+  bl vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl vert_filter
+-+  sub r3,8
+-+  vldb H(setup_input,0), -16(r4)
+-+  vstb H(zeros,0),-16(r4)
+-+  bl horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl horz_filter
+-+  sub r3,8*64
+-+  addcmpbeq r12,0,0,skip_save_top
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+skip_save_top:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+start_deblock_loop:
+-+  # move onto next 16x16 (could do this with circular buffer support instead)
+-+  add r3,16
+-+  and r3,r8
+-+  add r4,32
+-+  # Perform loop counter operations (may work with an addcmpbgt as well?)
+-+  add r0,16
+-+  add r2,16
+-+  sub r7,1
+-+  cmp r7,0 # Are there still more blocks to load
+-+  bgt deblock_loop
+-+
+-+  # Final iteration needs to just do horizontal filtering
+-+  vldb H(setup_input,0), -16(r4)
+-+  vstb H(zeros,0),-16(r4)
+-+  bl horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl horz_filter
+-+  sub r3,64*8
+-+  addcmpbeq r12,0,0,skip_save_top2
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+skip_save_top2:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+
+-+# Now look to see if we should do another row
+-+  sub r9,1
+-+  cmp r9,0
+-+  bgt start_again
+-+  pop r6-r15, pc
+-+start_again:
+-+  # Need to sort out r0,r2 to point to next row down
+-+  addscale16 r10,r1
+-+  mov r2,r10
+-+  subscale4 r0,r2,r1
+-+  b process_row
+-+
+-+
+-+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+-+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+-+
+-+vert_filter:
+-+  push lr
+-+
+-+  vmov HX(P3,0), V(16,12)+r3
+-+  vmov HX(P2,0), V(16,13)+r3
+-+  vmov HX(P1,0), V(16,14)+r3
+-+  vmov HX(P0,0), V(16,15)+r3
+-+  vmov HX(Q0,0), V(16,16)+r3
+-+  vmov HX(Q1,0), V(16,17)+r3
+-+  vmov HX(Q2,0), V(16,18)+r3
+-+  vmov HX(Q3,0), V(16,19)+r3
+-+
+-+  bl do_luma_filter
+-+
+-+  vadds V(16,13)+r3, HX(P2,0), 0
+-+  vadds V(16,14)+r3, HX(P1,0), 0
+-+  vadds V(16,15)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds V(16,16)+r3, HX(Q0,0), 0
+-+  vadds V(16,17)+r3, HX(Q1,0), 0
+-+  vadds V(16,18)+r3, HX(Q2,0), 0
+-+
+-+  pop pc
+-+
+-+# Filter edge at H(16,0)+r3
+-+horz_filter:
+-+  push lr
+-+
+-+  vmov HX(P3,0), H(12,0)+r3
+-+  vmov HX(P2,0), H(13,0)+r3
+-+  vmov HX(P1,0), H(14,0)+r3
+-+  vmov HX(P0,0), H(15,0)+r3
+-+  vmov HX(Q0,0), H(16,0)+r3
+-+  vmov HX(Q1,0), H(17,0)+r3
+-+  vmov HX(Q2,0), H(18,0)+r3
+-+  vmov HX(Q3,0), H(19,0)+r3
+-+
+-+  bl do_luma_filter
+-+
+-+  vadds H(13,0)+r3, HX(P2,0), 0
+-+  vadds H(14,0)+r3, HX(P1,0), 0
+-+  vadds H(15,0)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds H(16,0)+r3, HX(Q0,0), 0
+-+  vadds H(17,0)+r3, HX(Q1,0), 0
+-+  vadds H(18,0)+r3, HX(Q2,0), 0
+-+
+-+  pop pc
+-+
+-+# r4 points to array of beta/tc for each 4 length edge
+-+do_luma_filter:
+-+  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+-+  valtl HX(beta,0),H(setup,0),H(setup,0)
+-+  valtu HX(tc,0),H(setup,0),H(setup,0)
+-+  vmul HX(tc25,0), HX(tc,0), 5
+-+  vadd HX(tc25,0),HX(tc25,0), 1
+-+  vasr HX(tc25,0), HX(tc25,0), 1
+-+
+-+  # Compute decision
+-+  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+-+  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+-+  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+-+  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+-+
+-+  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+-+  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+-+  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+-+  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+-+
+-+  vadd HX(d,0), HX(dp,0), HX(dq,0)
+-+  vasr HX(beta2,0),HX(beta,0),2
+-+  vasr HX(beta3,0),HX(beta,0),3
+-+
+-+  # Compute flags that are negative if all conditions pass
+-+  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+-+  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+-+  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+-+
+-+  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+-+  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+-+  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+-+  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+-+  vmov HX(decision,0), 1 IFNN
+-+  vadd H(decision,0),H(decision,3),0 IFN
+-+  vadd H(decision,16),H(decision,19),0 IFN
+-+  vmov -,HX(decision,0) SETF   # N marks strong filter
+-+  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
+-+
+-+  vadd HX(do_filter,0), HX(d,3), HX(d,0)
+-+  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+-+  vmov HX(decision,0),0 IFNN # Z marks no filter
+-+
+-+  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
+-+  # First extract out even terms
+-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
+-+  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
+-+  # Now expand back
+-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+-+  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+-+
+-+  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+-+
+-+  # Do a quick check to see if there is anything to do
+-+  mov r11, 0 # Signal no filtering
+-+  vmov -,1 IFNZ SUMS r5
+-+  cmp r5,0
+-+  beq filtering_done
+-+  mov r11, 1 # Signal some filtering
+-+  # And whether there is any strong filtering
+-+  vmov -,1 IFN SUMS r5
+-+  cmp r5,0
+-+  beq normal_filtering
+-+
+-+  ##############################################################################
+-+  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+-+  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tx2 is tc/2, while here it is tc*2
+-+
+-+  # Take a copy of the original pixels for use in decision calculation
+-+  vmov HX(P0,32),HX(P0,0)
+-+  vmov HX(Q0,32),HX(Q0,0)
+-+  vmov HX(P1,32),HX(P1,0)
+-+  vmov HX(Q1,32),HX(Q1,0)
+-+  vmov HX(P2,32),HX(P2,0)
+-+  vmov HX(Q2,32),HX(Q2,0)
+-+
+-+  vadd -,HX(P2,32),4 CLRA SACC
+-+  vshl -,HX(P1,32),1 SACC
+-+  vshl -,HX(P0,32),1 SACC
+-+  vshl -,HX(Q0,32),1 SACC
+-+  vshl HX(delta,0),HX(Q1,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(P0,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(P2,32),2 CLRA SACC
+-+  vadd -,HX(P1,32),HX(P0,32) SACC
+-+  vshl HX(delta,0),HX(Q0,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 2
+-+  vsub HX(delta,0),HX(delta,0),HX(P1,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(Q0,32),4 CLRA SACC
+-+  vadd -,HX(P1,32),HX(P0,32) SACC
+-+  vmul -,HX(P2,32),3 SACC
+-+  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(P2,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+-+  #vmov HX(P2,0),3 IFN
+-+
+-+  # Now reverse all P/Qs
+-+
+-+  vadd -,HX(Q2,32),4 CLRA SACC
+-+  vshl -,HX(Q1,32),1 SACC
+-+  vshl -,HX(Q0,32),1 SACC
+-+  vshl -,HX(P0,32),1 SACC
+-+  vshl HX(delta,0),HX(P1,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(Q2,32),2 CLRA SACC
+-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
+-+  vshl HX(delta,0),HX(P0,32),0 SACC
+-+  vasr HX(delta,0),HX(delta,0), 2
+-+  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+-+
+-+  vadd -,HX(P0,32),4 CLRA SACC
+-+  vadd -,HX(Q1,32),HX(Q0,32) SACC
+-+  vmul -,HX(Q2,32),3 SACC
+-+  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+-+  vasr HX(delta,0),HX(delta,0), 3
+-+  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+-+  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+-+
+-+  ##############################################################################
+-+  # Normal filtering
+-+normal_filtering:
+-+  # Invert the decision flags
+-+  # make instruction more complicated as assembler has error and loses SETF
+-+  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+-+  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
+-+
+-+  vmov -,1 IFN SUMS r5
+-+  cmp r5,0
+-+  beq filtering_done
+-+
+-+  vasr HX(tc2,0), HX(tc,0), 1
+-+  vmul HX(tc10,0), HX(tc,0), 10
+-+
+-+  vasr HX(thresh,0), HX(beta,0), 1
+-+  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+-+  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
+-+
+-+  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+-+  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+-+  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+-+  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+-+  # Expand ptest and qtest together
+-+  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
+-+  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+-+  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+-+  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+-+  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+-+
+-+  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+-+  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+-+  vmov -,8 CLRA SACC
+-+  vmul -,HX(delta0,0), 9 SACC
+-+  vmul HX(delta0,0),HX(delta1,0), r6 SACC
+-+  vasr HX(delta0,0), HX(delta0,0), 4
+-+  vdist HX(deltatest,0), HX(delta0,0), 0
+-+  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+-+  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+-+
+-+  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+-+
+-+  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+-+  vadd HX(deltap1,0), HX(deltap1,0), 1
+-+  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+-+  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+-+  vasr HX(deltap1,0), HX(deltap1,0), 1
+-+  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+-+
+-+  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+-+  vadd HX(deltaq1,0), HX(deltaq1,0), 1
+-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+-+  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+-+  vrsub -, HX(delta0,0), 0 SACC
+-+  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+-+  vasr HX(deltaq1,0), HX(deltaq1,0), 1
+-+  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+-+
+-+  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+-+  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+-+
+-+  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+-+  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+-+
+-+  vmov -,HX(deltatest,0) SETF
+-+  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+-+  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+-+
+-+  #vmov HX(P2,0),1 IFN
+-+
+-+filtering_done:
+-+  b lr
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 0121fca..05b2169 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -147,7 +147,7 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   vcsm_init();
+-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+--  memset(ptr, 0, sizeof *ptr);
+-+  memset((void*)ptr, 0, sizeof *ptr);
+-   vc = gpu_mem_ptr.vc;
+- 
+-   ptr->mb = mb;
+-@@ -254,7 +254,7 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-     iocache.s[0].handle = p->vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = p->arm;
+-+    iocache.s[0].addr = (int) p->arm;
+-     iocache.s[0].size  = p->numbytes;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -390,6 +390,7 @@ static void *vpu_start(void *arg) {
+- #ifdef RPI_TIME_TOTAL_POSTED
+-   int last_time=0;
+-   long long on_time=0;
+-+  long long on_time_deblock=0;
+-   long long off_time=0;
+-   int start_time;
+-   int end_time;
+-@@ -451,10 +452,13 @@ static void *vpu_start(void *arg) {
+- #ifdef RPI_TIME_TOTAL_POSTED
+-     end_time = Microseconds();
+-     last_time = end_time;
+--    on_time += end_time - start_time;
+-+    if (p[6]==2)
+-+      on_time_deblock += end_time - start_time;
+-+    else
+-+      on_time += end_time - start_time;
+-     count++;
+-     if ((count&0x7f)==0)
+--      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+- #endif
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+-index e86eb30..c5d8b29 100644
+---- a/libavcodec/rpi_shader.c
+-+++ b/libavcodec/rpi_shader.c
+-@@ -61,7 +61,7 @@ unsigned int rpi_shader[] = {
+- /* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+- /* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+- /* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
+--/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
+-+/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif,r2
+- /* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
+- /* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
+- /* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+--- 
+-2.7.4
+-
+-
+-From e9c59f0d7b42dfb10d85ab2477f95b44484a8d70 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 1 Jul 2015 09:21:17 +0100
+-Subject: [PATCH 65/68] Added ability to combine jobs
+-
+----
+- libavcodec/rpi_qpu.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
+- 1 file changed, 80 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 05b2169..91777be 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -8,6 +8,8 @@
+- #define RPI_TIME_TOTAL_POSTED
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+-+// Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
+-+#define RPI_COMBINE_JOBS
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -398,9 +400,15 @@ static void *vpu_start(void *arg) {
+- #endif
+-   while(1) {
+-     int i;
+--    int *p;
+-+    int *p; // Pointer for a QPU/VPU job
+-+#ifdef RPI_COMBINE_JOBS
+-+    int *q = NULL; // Pointer for a VPU only job
+-+    int have_qpu = 0;
+-+    int have_vpu = 0;
+-+#endif
+-     int qpu_code;
+-     int qpu_codeb;
+-+    int num_jobs; // Number of jobs available
+-     pthread_mutex_lock(&post_mutex);
+-     while( vpu_async_tail - vpu_async_head <= 0)
+-     {
+-@@ -408,13 +416,38 @@ static void *vpu_start(void *arg) {
+-       pthread_cond_wait(&post_cond_tail, &post_mutex);
+-     }
+-     p = vpu_cmds[vpu_async_head%MAXCMDS];
+-+    num_jobs = vpu_async_tail - vpu_async_head;
+-     pthread_mutex_unlock(&post_mutex);
+- 
+-     if (p[6] == -1) {
+-       break; // Last job
+-     }
+-+    if (p[7] == 0 && p[0] == 0 && p[16]==0)
+-+      goto job_done_early;
+-+
+-+#ifdef RPI_COMBINE_JOBS
+-+    // First scan for a qpu job
+-+    for (int x=0;x<num_jobs;x++) {
+-+      p = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
+-+      if (p[7]) {
+-+        have_qpu = 1;
+-+        break;
+-+      }
+-+    }
+-+    // Now scan for a non-qpu job
+-+    for (int x=0;x<num_jobs;x++) {
+-+      q = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
+-+      if (!q[7]) {
+-+        have_vpu = 1;
+-+        break;
+-+      }
+-+    }
+-+    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+-+#endif
+-     qpu_code = p[7];
+-     qpu_codeb = p[16];
+-+
+-+
+-     //if (p[7]) {
+-         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
+-         //gpu_cache_flush(buf);
+-@@ -427,6 +460,40 @@ static void *vpu_start(void *arg) {
+-     off_time += start_time-last_time;
+- #endif
+- 
+-+#ifdef RPI_COMBINE_JOBS
+-+    if (have_qpu) {
+-+      for(i=0;i<8;i++) {
+-+        gpu->mail[i*2] = p[8+i];
+-+        gpu->mail[i*2 + 1] = qpu_code;
+-+      }
+-+      for(i=0;i<12;i++) {
+-+        gpu->mail2[i*2] = p[17+i];
+-+        gpu->mail2[i*2 + 1] = qpu_codeb;
+-+      }
+-+      if (have_vpu) {
+-+        execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
+-+        q[0] = 0;
+-+      } else {
+-+        execute_multi(gpu->mb,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-+                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-+                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-+      }
+-+      p[0] = 0;
+-+      p[7] = 0;
+-+      p[16] = 0;
+-+    } else {
+-+        av_assert0(have_vpu);
+-+        vpu_execute_code(q[0], q[1], q[2], q[3], q[4], q[5], q[6]);
+-+        q[0] = 0;
+-+    }
+-+#else
+-+
+-     if (!qpu_code) {
+-       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
+-     } else {
+-@@ -449,17 +516,29 @@ static void *vpu_start(void *arg) {
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+- #endif
+-     }
+-+#endif
+-+
+- #ifdef RPI_TIME_TOTAL_POSTED
+-     end_time = Microseconds();
+-     last_time = end_time;
+-+#ifdef RPI_COMBINE_JOBS
+-+    // There are three cases we may wish to distinguish of VPU/QPU activity
+-+    on_time += end_time - start_time;
+-+#else
+-     if (p[6]==2)
+-       on_time_deblock += end_time - start_time;
+-     else
+-       on_time += end_time - start_time;
+-+#endif
+-     count++;
+-     if ((count&0x7f)==0)
+-+#ifdef RPI_COMBINE_JOBS
+-       printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+-+#else
+-+      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#endif
+- #endif
+-+job_done_early:
+-     pthread_mutex_lock(&post_mutex);
+-     vpu_async_head++;
+-     pthread_cond_broadcast(&post_cond_head);
+--- 
+-2.7.4
+-
+-
+-From 0d54661f303b2a8903e806648ed54a34dcf315dc Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 1 Jul 2015 12:53:10 +0100
+-Subject: [PATCH 66/68] Added chroma deblocking
+-
+----
+- libavcodec/hevc.c               |  20 ++
+- libavcodec/hevc.h               |  12 +-
+- libavcodec/hevc_filter.c        |  92 +++++-
+- libavcodec/rpi_hevc_transform.h | 644 +++++++++++++++++++++++++++++++++++++++-
+- libavcodec/rpi_hevc_transform.s | 207 +++++++++++++
+- libavcodec/rpi_qpu.c            |  27 +-
+- libavcodec/rpi_shader.qasm      |  11 +
+- 7 files changed, 988 insertions(+), 25 deletions(-)
+-
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 4ce94a7..8437e10 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -251,6 +251,14 @@ static void pic_arrays_free(HEVCContext *s)
+-       gpu_free(&s->y_setup_ptr);
+-       s->y_setup_arm = 0;
+-     }
+-+    if (s->uv_setup_arm) {
+-+      gpu_free(&s->uv_setup_ptr);
+-+      s->uv_setup_arm = 0;
+-+    }
+-+    if (s->vpu_cmds_arm) {
+-+      gpu_free(&s->vpu_cmds_ptr);
+-+      s->vpu_cmds_arm = 0;
+-+    }
+- #endif
+-     av_freep(&s->sao);
+-     av_freep(&s->deblock);
+-@@ -324,6 +332,18 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+-     s->y_setup_vc = (void*)s->y_setup_ptr.vc;
+-     memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
+-     printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
+-+
+-+    s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+-+    s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+-+    gpu_malloc_uncached(sizeof(*s->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height, &s->uv_setup_ptr); // TODO make this cached
+-+    s->uv_setup_arm = (void*)s->uv_setup_ptr.arm;
+-+    s->uv_setup_vc = (void*)s->uv_setup_ptr.vc;
+-+    memset(s->uv_setup_arm, 0, s->uv_setup_ptr.numbytes);
+-+    printf("Setup uv %d by %d by %d\n",s->uv_setup_width,s->uv_setup_height,sizeof(*s->uv_setup_arm));
+-+
+-+    gpu_malloc_uncached(sizeof(*s->vpu_cmds_arm) * 3,&s->vpu_cmds_ptr);
+-+    s->vpu_cmds_arm = (void*) s->vpu_cmds_ptr.arm;
+-+    s->vpu_cmds_vc = s->vpu_cmds_ptr.vc;
+- #endif
+- 
+-     s->bs_width  = (width  >> 2) + 1;
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index cf08489..7eb37e6 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -56,7 +56,7 @@
+-   #define RPI_MAX_JOBS 2
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+--
+-+  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+-   #define RPI_DEBLOCK_VPU
+- 
+- #endif
+-@@ -980,6 +980,16 @@ typedef struct HEVCContext {
+-     uint8_t (*y_setup_vc)[2][2][2][4];
+-     int setup_width; // Number of 16x16 blocks across the image
+-     int setup_height; // Number of 16x16 blocks down the image
+-+
+-+    GPU_MEM_PTR_T uv_setup_ptr;
+-+    uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
+-+    uint8_t (*uv_setup_vc)[2][2][2][4];
+-+    int uv_setup_width;
+-+    int uv_setup_height;
+-+
+-+    GPU_MEM_PTR_T vpu_cmds_ptr;
+-+    int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+-+    int vpu_cmds_vc;
+- #endif
+- 
+- #endif
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 06371da..6367068 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -656,9 +656,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                                    s->frame->linesize[chroma],
+-                                                                    c_tc, no_p, no_q);
+-                         } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                        if (s->enable_rpi_deblock) {
+-+                            uint8_t (*setup)[2][2][4];
+-+                            int xc = x>>s->ps.sps->hshift[chroma];
+-+                            int yc = y>>s->ps.sps->vshift[chroma];
+-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+-+                            int a = ((yc>>3) & 1) << 1;
+-+                            int b = (xc>>3) & 1;
+-+                            setup = s->uv_setup_arm[num16];
+-+                            setup[0][b][0][a] = c_tc[0];
+-+                            setup[0][b][0][a + 1] = c_tc[1];
+-+                        } else
+-+#endif
+-                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
+-                                                                  s->frame->linesize[chroma],
+-                                                                  c_tc, no_p, no_q);
+-+
+-                     }
+-                 }
+- 
+-@@ -689,6 +703,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                                    s->frame->linesize[chroma],
+-                                                                    c_tc, no_p, no_q);
+-                         } else
+-+#ifdef RPI_DEBLOCK_VPU
+-+                        if (s->enable_rpi_deblock) {
+-+                            uint8_t (*setup)[2][2][4];
+-+                            int xc = x>>s->ps.sps->hshift[chroma];
+-+                            int yc = y>>s->ps.sps->vshift[chroma];
+-+                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+-+                            int a = ((xc>>3) & 1) << 1;
+-+                            int b = (yc>>3) & 1;
+-+                            setup = s->uv_setup_arm[num16];
+-+                            setup[1][b][0][a] = c_tc[0];
+-+                            setup[1][b][0][a + 1] = c_tc[1];
+-+                        } else
+-+#endif
+-                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
+-                                                                  s->frame->linesize[chroma],
+-                                                                  c_tc, no_p, no_q);
+-@@ -1013,33 +1040,56 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+- {
+-   // Flush image, 4 lines above to bottom of ctb stripe
+--  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
+-+  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+-   // TODO flush buffer of beta/tc setup when it becomes cached
+-+
+-+  // Prepare three commands at once to avoid calling overhead
+-+  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
+-+  s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+-+  s->vpu_cmds_arm[0][2] = s->setup_width;
+-+  s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
+-+  s->vpu_cmds_arm[0][4] = ctb_size>>4;
+-+  s->vpu_cmds_arm[0][5] = 2;
+-+
+-+  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-+  s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+-+  s->vpu_cmds_arm[1][2] = s->uv_setup_width;
+-+  s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-+  s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-+  s->vpu_cmds_arm[1][5] = 3;
+-+
+-+  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-+  s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+-+  s->vpu_cmds_arm[2][2] = s->uv_setup_width;
+-+  s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-+  s->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-+  s->vpu_cmds_arm[2][5] = 4;
+-+
+-   // Call VPU
+--  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
+--  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
+--                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
+--                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
+-+  vpu_wait(vpu_post_code( vpu_get_fn(), s->vpu_cmds_vc, 3, 0, 0, 0, 5, 0)); // 5 means to do all the commands
+- }
+- 
+--static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
+--{
+--   int y2;
+--   for(y2=y;y2<y+ctb_size;y2+=16) {
+--      rpi_deblock(s,y2,16);
+--   }
+--}
+- #endif
+- 
+- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+- {
+-     int x_end = x >= s->ps.sps->width  - ctb_size;
+-+#ifdef RPI_DEBLOCK_VPU
+-+    int done_deblock = 0;
+-+#endif
+-     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
+-         deblocking_filter_CTB(s, x, y);
+- #ifdef RPI_DEBLOCK_VPU
+-     if (s->enable_rpi_deblock && x_end)
+-     {
+--      rpi_deblock(s, y, ctb_size);
+-+      int y_at_end = y >= s->ps.sps->height - ctb_size;
+-+      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
+-+      int y_start = y&~63;
+-+      if (y_at_end) height = s->ps.sps->height - y_start;
+-+      if ((((y+ctb_size)&63)==0) || y_at_end) {
+-+        done_deblock = 1;
+-+        rpi_deblock(s, y_start, height);
+-+      }
+-     }
+- #endif
+-     if (s->ps.sps->sao_enabled) {
+-@@ -1070,11 +1120,25 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+-         //int newh = y + ctb_size - 4;
+-         //int currh = s->ref->tf.progress->data[0];
+-         //if (((y + ctb_size)&63)==0)
+-+#ifdef RPI_DEBLOCK_VPU
+-+        if (s->enable_rpi_deblock) {
+-+          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+-+          if (done_deblock) {
+-+            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+          }
+-+        } else {
+-+#ifdef RPI_INTER_QPU
+-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+-+#endif
+-+          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+        }
+-+#else
+- #ifdef RPI_INTER_QPU
+-         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+--        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+-+        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+- #endif
+-         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+-+#endif
+-     }
+- }
+- 
+-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+-index b3f155f..4309f1c 100644
+---- a/libavcodec/rpi_hevc_transform.h
+-+++ b/libavcodec/rpi_hevc_transform.h
+-@@ -3,14 +3,32 @@ unsigned char rpi_hevc_transform [] = {
+- 106,
+- 0,
+- 144,
+--38,
+++192,
+++243,
+++211,
+++31,
+++128,
+++248,
+++0,
+++0,
+++112,
+++0,
+++192,
+++243,
+++211,
+++31,
+++128,
+++144,
+++112,
+++0,
+++188,
+++64,
+++67,
+++232,
+++0,
+++2,
+++0,
+++0,
+++0,
+++255,
+++64,
+++0,
+++0,
+++20,
+++200,
+++243,
+++0,
+++0,
+++128,
+++144,
+++101,
+++0,
+++195,
+++232,
+++0,
+++2,
+++0,
+++0,
+++12,
+++128,
+++7,
+++192,
+++130,
+++248,
+++0,
+++0,
+++112,
+++192,
+++224,
+++16,
+++195,
+++31,
+++132,
+++248,
+++1,
+++0,
+++112,
+++0,
+++224,
+++16,
+++203,
+++31,
+++25,
+++102,
+++9,
+++106,
+++2,
+++30,
+++41,
+++3,
+++26,
+++87,
+++162,
+++64,
+++64,
+++198,
+++1,
+++23,
+++127,
+++158,
+++103,
+++255,
+++239,
+++3,
+++0,
+++254,
+++0,
+++143,
+++92,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++143,
+++93,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++143,
+++94,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++143,
+++95,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++142,
+++208,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++142,
+++209,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++142,
+++210,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++0,
+++142,
+++211,
+++0,
+++0,
+++240,
+++12,
+++0,
+++128,
+++144,
+++107,
+++0,
+++8,
+++255,
+++99,
+++23,
+++0,
+++212,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++23,
+++0,
+++228,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++227,
+++23,
+++0,
+++244,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++35,
+++52,
+++0,
+++180,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++99,
+++52,
+++0,
+++164,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++52,
+++0,
+++148,
+++192,
+++51,
+++0,
+++0,
+++111,
+++3,
+++239,
+++3,
+++0,
+++254,
+++0,
+++143,
+++12,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++143,
+++13,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++143,
+++14,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++143,
+++15,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++192,
+++142,
+++16,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++128,
+++142,
+++17,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++64,
+++142,
+++18,
+++0,
+++0,
+++240,
+++12,
+++0,
+++0,
+++254,
+++0,
+++142,
+++19,
+++0,
+++0,
+++240,
+++12,
+++0,
+++128,
+++144,
+++33,
+++0,
+++8,
+++255,
+++99,
+++3,
+++0,
+++212,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++3,
+++0,
+++228,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++227,
+++3,
+++0,
+++244,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++35,
+++4,
+++0,
+++180,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++99,
+++4,
+++0,
+++164,
+++192,
+++51,
+++0,
+++0,
+++8,
+++255,
+++163,
+++4,
+++0,
+++148,
+++192,
+++51,
+++0,
+++0,
+++111,
+++3,
+++32,
+++246,
+++192,
+++11,
+++1,
+++16,
+++32,
+++246,
+++2,
+++137,
+++47,
+++240,
+++40,
+++246,
+++2,
+++140,
+ +47,
+- 1,
+- 37,
+- 106,
+- 0,
+- 144,
+--57,
+++240,
+++128,
+++245,
+++99,
+++140,
+++5,
+++4,
+++0,
+++247,
+++99,
+++140,
+++1,
+++20,
+++88,
+++246,
+++99,
+++140,
+++1,
+++20,
+++0,
+++247,
+++35,
+++136,
+++62,
+++226,
+++32,
+++247,
+++35,
+++136,
+++32,
+++210,
+++0,
+++247,
+++34,
+++136,
+++63,
+++2,
+++208,
+++246,
+++34,
+++136,
+++0,
+++4,
+++0,
+++247,
+++99,
+++136,
+++58,
+++162,
+++32,
+++247,
+++99,
+++136,
+++33,
+++146,
+++0,
+++247,
+++98,
+++136,
+++59,
+++18,
+++208,
+++246,
+++98,
+++136,
+++0,
+++20,
+++0,
+++247,
+++162,
+++136,
+++33,
+++2,
+++88,
+++246,
+++98,
+++137,
+++2,
+++68,
+++88,
+++246,
+++162,
+++137,
+++3,
+++68,
+++208,
+++254,
+++227,
+++136,
+++60,
+++242,
+++192,
+++243,
+++188,
+++11,
+++208,
+++254,
+++227,
+++136,
+++56,
+++178,
+++192,
+++243,
+++188,
+++10,
+++32,
+++255,
+++226,
+++136,
+++38,
+++58,
+++192,
+++243,
+++60,
+++0,
+++208,
+++254,
+++227,
+++136,
+++59,
+++242,
+++192,
+++243,
+++60,
+++128,
+++32,
+++255,
+++226,
+++136,
+++49,
+++58,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++226,
+++136,
+++34,
+++34,
+++192,
+++243,
+++60,
+++128,
+++32,
+++255,
+++226,
+++136,
+++37,
+++58,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++192,
+++136,
+++1,
+++4,
+++0,
+++240,
+++0,
+++160,
+++0,
+++255,
+++194,
+++8,
+++0,
+++52,
+++195,
+++243,
+++0,
+++128,
+++0,
+++255,
+++202,
+++40,
+++0,
+++52,
+++195,
+++243,
+++0,
+++128,
+++0,
+++254,
+++0,
+++240,
+++35,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++192,
+++136,
+++1,
+++4,
+++0,
+++240,
+++0,
+++160,
+++0,
+++255,
+++226,
+++140,
+++34,
+++34,
+++195,
+++243,
+++60,
+++0,
+++32,
+++255,
+++227,
+++140,
+++36,
+++58,
+++192,
+++243,
+++60,
+++0,
+++0,
+++254,
+++192,
+++136,
+++0,
+++4,
+++0,
+++240,
+++0,
+++160,
+++16,
+++246,
+++226,
+++136,
+++35,
+++50,
+++16,
+++246,
+++226,
+++136,
+++35,
+++50,
+++32,
+++246,
+++226,
+++136,
+++35,
+++50,
+++32,
+++254,
+++226,
+++136,
+++35,
+++58,
+++192,
+++243,
+++60,
+++0,
+++11,
+++96,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++115,
+++5,
+++106,
+++0,
+++144,
+++173,
+++1,
+++27,
+++96,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++147,
+++5,
+++106,
+++0,
+++144,
+++227,
+++0,
+++64,
+++246,
+++163,
+++140,
+++1,
+++4,
+++0,
+++246,
+++192,
+++175,
+++63,
+++2,
+++0,
+++246,
+++192,
+++174,
+++59,
+++2,
+++0,
+++246,
+++128,
+++175,
+++62,
+++2,
+++0,
+++246,
+++128,
+++174,
+++58,
+++2,
+++0,
+++246,
+++64,
+++175,
+++61,
+++2,
+++0,
+++246,
+++64,
+++174,
+++57,
+++2,
+++0,
+++255,
+++43,
+++240,
+++4,
+++212,
+++192,
+++243,
+++128,
+++11,
+++64,
+++254,
+++43,
+++240,
+++1,
+++228,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++244,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++180,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++164,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++191,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++235,
+++143,
+++52,
+++242,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++2,
+++212,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++191,
+++226,
+++192,
+++243,
+++188,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++180,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++2,
+++68,
+++32,
+++247,
+++35,
+++141,
+++190,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++171,
+++143,
+++52,
+++226,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++180,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++191,
+++226,
+++192,
+++243,
+++188,
+++10,
+++128,
+++253,
+++43,
+++240,
+++3,
+++212,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++35,
+++141,
+++1,
+++196,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++189,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++107,
+++143,
+++52,
+++210,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++148,
+++192,
+++243,
+++128,
+++11,
+++64,
+++254,
+++43,
+++240,
+++1,
+++164,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++180,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++240,
+++1,
+++244,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++228,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++187,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++235,
+++142,
+++52,
+++178,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++2,
+++148,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++187,
+++162,
+++192,
+++243,
+++188,
+++10,
+++64,
+++254,
+++43,
+++141,
+++0,
+++244,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++2,
+++68,
+++32,
+++247,
+++35,
+++141,
+++186,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+++66,
+++0,
+++255,
+++171,
+++142,
+++52,
+++162,
+++192,
+++243,
+++60,
+++128,
+++0,
+++255,
+++43,
+++240,
+++4,
+++244,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++43,
+++240,
+++187,
+++162,
+++192,
+++243,
+++188,
+++10,
+++128,
+++253,
+++43,
+++240,
+++3,
+++148,
+++192,
+++243,
+++128,
+++10,
+++64,
+++254,
+++35,
+++141,
+++1,
+++132,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++35,
+++141,
+++3,
+++68,
+++32,
+++247,
+++35,
+++141,
+++185,
+++66,
+++240,
+++246,
+++35,
+++141,
+++50,
+ +66,
+- 1,
+++0,
+++255,
+++107,
+++142,
+++52,
+++146,
+++192,
+++243,
+++60,
+++128,
+++64,
+++255,
+++98,
+++141,
+++0,
+++52,
+++192,
+++243,
+++0,
+++0,
+++0,
+++254,
+++0,
+++240,
+ +53,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++0,
+++240,
+++1,
+++4,
+++0,
+++240,
+++64,
+++147,
+++5,
+ +106,
+ +0,
+ +144,
+++177,
+++0,
+++88,
+++246,
+++163,
+++140,
+++1,
+++4,
+++128,
+++245,
+++99,
+++141,
+++10,
+++4,
+++88,
+++246,
+++162,
+++138,
+++1,
+++68,
+++0,
+++247,
+++162,
+++138,
+++36,
+++162,
+++88,
+++254,
+++162,
+++138,
+++3,
+++164,
+++192,
+++243,
+++128,
+++11,
+++0,
+++255,
+++226,
+++137,
+++32,
+++2,
+++195,
+++243,
+++60,
+++0,
+++32,
+++247,
+++226,
+++137,
+++42,
+++114,
+++0,
+++255,
+++34,
+++138,
+++33,
+++18,
+++195,
+++243,
+++60,
+++0,
+++32,
+++247,
+++34,
+++138,
+++42,
+++130,
+++16,
+++246,
+++98,
+++138,
+++40,
+++114,
+++16,
+++246,
+++98,
+++138,
+++41,
+++146,
+++32,
+++246,
+++98,
+++138,
+++41,
+++146,
+++32,
+++246,
+++226,
+++137,
+++41,
+++146,
+++40,
+++246,
+++34,
+++138,
+++41,
+++146,
+++32,
+++247,
+++163,
+++141,
+++63,
+++178,
+++32,
+++247,
+++227,
+++141,
+++62,
+++162,
+++0,
+++254,
+++0,
+++240,
+++8,
+++4,
+++0,
+++240,
+++128,
+++11,
+++128,
+++253,
+++35,
+++240,
+++9,
+++100,
+++192,
+++243,
+++128,
+++10,
+++128,
+++253,
+++163,
+++141,
+++128,
+++115,
+ +192,
+++243,
+++152,
+++10,
+++88,
+++246,
+++163,
+++141,
+ +4,
+-+69,
+-+106,
+++100,
+++208,
+++246,
+++35,
+++139,
+ +0,
+-+144,
+++100,
+++32,
+++255,
+++34,
+++139,
+++53,
+++202,
+ +192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++139,
+++0,
+ +4,
+-+85,
+-+106,
+ +0,
+-+144,
+-+220,
+-+5,
+- 169,
+- 3,
+- 62,
+-@@ -2427,4 +2445,626 @@ unsigned char rpi_hevc_transform [] = {
+- 128,
+- 90,
+- 0,
+++240,
+++0,
+++160,
+++240,
+++246,
+++163,
+++141,
+++48,
+++98,
+++0,
+++247,
+++99,
+++139,
+++63,
+++210,
+++0,
+++247,
+++98,
+++139,
+++1,
+++212,
+++88,
+++254,
+++98,
+++139,
+++1,
+++212,
+++192,
+++243,
+++128,
+++11,
+++32,
+++255,
+++99,
+++139,
+++62,
+++98,
+++192,
+++243,
+++188,
+++10,
+++88,
+++246,
+++98,
+++139,
+++1,
+++212,
+++240,
+++246,
+++98,
+++139,
+++50,
+++210,
+++0,
+++247,
+++163,
+++128,
+++59,
+++146,
+++0,
+++247,
+++160,
+++128,
+++1,
+++36,
+++88,
+++254,
+++160,
+++128,
+++1,
+++36,
+++192,
+++243,
+++128,
+++11,
+++0,
+++247,
+++163,
+++128,
+++58,
+++98,
+++64,
+++255,
+++35,
+++240,
+++0,
+++100,
+++192,
+++243,
+++128,
+++10,
+++64,
+++255,
+++163,
+++128,
+++0,
+++164,
+++192,
+++243,
+++128,
+++10,
+++88,
+++246,
+++160,
+++128,
+++1,
+++36,
+++240,
+++246,
+++160,
+++128,
+++50,
+++34,
+++8,
+++255,
+++227,
+++143,
+++54,
+++242,
+++192,
+++243,
+++60,
+++128,
+++40,
+++255,
+++227,
+++142,
+++54,
+++178,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++240,
+++39,
+++10,
+++0,
+++240,
+++60,
+++128,
+++8,
+++255,
+++163,
+++143,
+++45,
+++226,
+++192,
+++243,
+++60,
+++128,
+++0,
+++254,
+++0,
+++240,
+++44,
+++10,
+++0,
+++240,
+++60,
+++0,
+++0,
+++254,
+++0,
+++240,
+++40,
+++10,
+++0,
+++240,
+++60,
+++128,
+++8,
+++255,
+++163,
+++142,
+++2,
+++162,
+++192,
+++243,
+++60,
+++128,
+++90,
+++0,
+ +169,
+ +3,
+ +14,
+@@ -35609,15 +10256,100 @@ index b3f155f..4309f1c 100644
+ +30,
+ +33,
+ +3,
+- };
+++};
+ diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+-index b055208..5543093 100644
+---- a/libavcodec/rpi_hevc_transform.s
++new file mode 100644
++index 0000000..5543093
++--- /dev/null
+ +++ b/libavcodec/rpi_hevc_transform.s
+-@@ -85,6 +85,13 @@ hevc_trans_16x16:
+-   beq memclear16
+-   cmp r5,2
+-   beq hevc_deblock_16x16
++@@ -0,0 +1,917 @@
+++# ******************************************************************************
+++# Argon Design Ltd.
+++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
+++#
+++# Module : HEVC
+++# Author : Peter de Rivaz
+++# ******************************************************************************
+++
+++# HEVC VPU Transform
+++#
+++# Transform matrix can be thought of as
+++#   output row vector = input row vector * transMatrix2
+++#
+++# The even rows of the matrix are symmetric
+++# The odd rows of the matrix are antisymmetric
+++#
+++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
+++#
+++# EXAMPLE
+++#   (a b c d) (1 2  2  1)
+++#             (3 4 -4 -3)
+++#             (5 6  6  5)
+++#             (7 8 -8 -7)
+++#
+++#  x=(a c)(1 2) = 1a+5c 2a+6c
+++#         (5 6)
+++#
+++#  y=(b d)(3 4) = 3b+7d 4b+8d
+++#         (7 8)
+++#
+++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
+++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
+++#
+++#  Final results are (u , v[::-1])
+++#
+++#
+++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
+++#  Apply the even matrix first and stop before rounding
+++#  Then apply the odd matrix in a full manner:
+++#
+++#   First step is to compute partial products with the first input (16 cycles)
+++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
+++#   2a 4b 6c 8d
+++#   2a -4b 6c -8d
+++#   1a -3b 5c -7d
+++#
+++#   Second step is to sum partial products into final position (8 cycles)
+++#   1a+3b+5c+7d
+++#   2a+4b+6c+8d
+++#   2a-4b+6c-8d
+++#   1a-3b+5c-7d
+++#
+++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
+++#
+++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
+++#
+++#   For 8x8 we could compute two in parallel.
+++#
+++#
+++
+++# Columns are transformed first
+++#
+++# Store top left half of transMatrix2 in HX(32,0)
+++# Store bottom left half of transMatrix2 in HX(32,32)
+++#
+++# For 16x16
+++# HX(0:15,0) contains input data before transform
+++# HY(0:15,0) contains 32bit output data after transform
+++# HX(32,0) contains even rows of left half of transMatrix2
+++# HX(32,32) contains odd rows of left half of transMatrix2
+++# HY(48,0) contains partial products ready for summing
+++#
+++
+++
+++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
+++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
+++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+++# num: number of 16x16 transforms to be done
+++# coeffs32
+++# num32: number of 32x32 transforms
+++# command (r5): 0 for transform, 1 for memclear16(int16_t *dst,num16), 2 for hevc_deblock_16x16, 3 for hevc_uv_deblock_16x16, 5 for hevc_run_command_list
+++#
+++hevc_trans_16x16:
+++  cmp r5,1
+++  beq memclear16
+++  cmp r5,2
+++  beq hevc_deblock_16x16
+ +  cmp r5,3
+ +  beq hevc_uv_deblock_16x16
+ +  cmp r5,4
+@@ -35625,1937 +10357,4614 @@ index b055208..5543093 100644
+ +  cmp r5,5
+ +  beq hevc_run_command_list
+ +
+-   push r6-r15, lr # TODO cut down number of used registers
+-   mov r14,r3 # coeffs32
+-   mov r15,r4 # num32
+-@@ -708,3 +715,203 @@ normal_filtering:
+- 
+- filtering_done:
+-   b lr
+++  push r6-r15, lr # TODO cut down number of used registers
+++  mov r14,r3 # coeffs32
+++  mov r15,r4 # num32
+++  mov r3, 16*2 # Stride of transMatrix2 in bytes
+++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
+++
+++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
+++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+++
+++  # Now use r0 to describe which matrix we are working on.
+++  # Allows us to prefetch the next block of coefficients for efficiency.
+++  mov r0,0 # This describes the location where we read our coefficients from
+++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
+++  mov r7,16*16*2 # Total block size
+++  mov r8,64*16 # Value used to swap from current to next VRF location
+++  vldh HX(0++,0)+r0,(r1 += r3) REP 16 # Load the first block of coefficients
+++  mov r4,64 # Constant used for rounding first pass
+++  mov r5,1<<11 # Constant used for rounding second pass
+++
+++  # At start of block r0,r1 point to the current block (that has already been loaded)
+++block_loop:
+++  eor r0,r8 # Switch to the other VRF location
+++  add r1,r7 # Point at the next block in memory
+++  # Prefetch the next block (NOTE(review): also issued on the last iteration, so it reads one block past the last one - confirm the buffer allows this)
+++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
+++  eor r0,r8 # Switch back to the current VRF location
+++  sub r1,r7 # Point back at the current block in memory
+++
+++  # Transform the current block
+++  bl col_trans_16
+++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
+++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
+++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
+++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
+++
+++  bl col_trans_16
+++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 12, and saturate
+++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
+++  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
+++
+++  # Save results - note there has been a transposition during the processing so we save columns
+++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
+++
+++  # Move onto next block
+++  eor r0,r8
+++  add r1,r7
+++
+++  addcmpbgt r2,-1,0,block_loop # Presumably r2 -= 1, then loop while r2 > 0. NOTE(review): the body runs once before this test even if num is 0 - confirm callers
+++
+++  # Now go and do any 32x32 transforms
+++  b hevc_trans_32x32
+++
+++  pop r6-r15, pc # Not reached (unconditional branch above); hevc_trans_32x32 ends with the matching pop
+++
+++# r1,r2,r3 r7,r8 should be preserved
+++# HX(0++,0)+r0 is the block to be transformed
+++# HX(32++,0) is the 16x16 matrix of transform coefficients (r6 is not an offset: it is overwritten below as the loop bound)
+++# Use HY(48,0) for intermediate results
+++# r0 can be used, but should be returned to its original value at the end
+++col_trans_16:
+++  add r6,r0,16 # Final value for this loop
+++col_trans_16_loop:
+++  # First compute partial products for a single column
+++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
+++  # Then sum up the results and place back
+++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+++  addcmpblt r0,1,r6,col_trans_16_loop # Presumably r0 += 1, then loop while r0 < r6 (16 columns in total)
+++  sub r0,16  # put r0 back to its original value
+++  b lr # Return to caller
+++
+++col_trans_odd_16:
+++  add r6,r0,16 # Final value for this loop
+++col_trans_odd_16_loop:
+++  # First compute partial products for a single column, using the odd matrix at VX(32,32++) instead of the even one
+++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
+++  # Then sum up the results and place back
+++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
+++  addcmpblt r0,1,r6,col_trans_odd_16_loop # Presumably r0 += 1, then loop while r0 < r6 (16 columns in total)
+++  sub r0,16  # put r0 back to its original value
+++  b lr # Return to caller
+++
+++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
+++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
+++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
+++# num: number of 32x32 transforms to be done
+++# Entered by a branch from hevc_trans_16x16 with r14 = coeffs32, r15 = num32 and r6-r15, lr already pushed (popped at the end here)
+++hevc_trans_32x32:
+++  mov r1,r14 # coeffs
+++  mov r2,r15 # num
+++
+++  # The even and odd transform matrices were already loaded by hevc_trans_16x16, so the loads below are disabled
+++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
+++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
+++  #add r0, 16*16*2
+++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
+++
+++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
+++  mov r7, 16*16*2 # Total block size
+++  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
+++  # set r8 to 32byte aligned stack pointer
+++  add r8,sp,31
+++  lsr r8,5
+++  lsl r8,5
+++  mov r9,r8  # Backup of the temporary storage
+++  mov r10,r1 # Backup of the coefficient buffer
+++block_loop32:
+++
+++  # COLUMN TRANSFORM
+++  mov r4, 64 # Constant used for rounding first pass
+++  mov r5, 9 # left shift used for rounding first pass
+++
+++  # Transform the first 16 columns
+++  mov r1,r10  # Input Coefficient buffer
+++  mov r8,r9   # Output temporary storage
+++  bl trans32
+++  # Transform the second 16 columns
+++  add r8,32*16*2
+++  add r1,32
+++  bl trans32
+++
+++  # ROW TRANSFORM
+++  mov r4, 1<<11 # Constant used for rounding second pass
+++  mov r5, 4 # left shift used for rounding second pass
+++
+++  mov r1,r9  # Input temporary storage
+++  mov r8,r10   # Output Coefficient buffer
+++  bl trans32
+++  # Transform the second 16 columns
+++  add r8,32*16*2
+++  add r1,32
+++  bl trans32
+++
+++  add r10, 32*32*2 # move onto next block of coefficients
+++  addcmpbgt r2,-1,0,block_loop32 # Presumably r2 -= 1, then loop while r2 > 0. NOTE(review): the body runs once before this test even if num is 0 - confirm callers
+++
+++  add sp,sp,32*32*2+32 # Restore stack
+++
+++  pop r6-r15, pc # Matches the push at the start of hevc_trans_16x16
+++
+++trans32:
+++  push lr # Inputs set by the caller: r1 = input, r8 = output, r3 = input row stride, r4 = rounding constant, r5 = left shift
+++  # We can no longer afford the VRF space to do prefetching when doing 32x32
+++  # Fetch the even rows
+++  vldh HX(0++,0),(r1 += r3) REP 16
+++  # Fetch the odd rows
+++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
+++
+++  # Transform the even rows using even matrix
+++  mov r0, 0 # Even rows
+++  bl col_trans_16
+++
+++  # Now transform the odd rows using odd matrix
+++  mov r0, 64*16 # Odd rows
+++  bl col_trans_odd_16
+++
+++  # Now apply butterfly to compute the first 16 results
+++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
+++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # left shift by r5 (9 or 4, chosen by the caller). NOTE(review): vasl may not saturate - confirm
+++  # 16bit results now in HX(48,32)
+++  mov r0,r8
+++  mov r6,32*2 # Stride used by the stores below
+++  vsth VX(48,32++),(r0+=r6) REP 16
+++
+++  # Now apply butterfly to compute the second 16 results (in reverse order)
+++  vsub HY(63,0),HY(0 ,0),HY(16,0)
+++  vsub HY(62,0),HY(1 ,0),HY(17,0)
+++  vsub HY(61,0),HY(2 ,0),HY(18,0)
+++  vsub HY(60,0),HY(3 ,0),HY(19,0)
+++  vsub HY(59,0),HY(4 ,0),HY(20,0)
+++  vsub HY(58,0),HY(5 ,0),HY(21,0)
+++  vsub HY(57,0),HY(6 ,0),HY(22,0)
+++  vsub HY(56,0),HY(7 ,0),HY(23,0)
+++  vsub HY(55,0),HY(8 ,0),HY(24,0)
+++  vsub HY(54,0),HY(9 ,0),HY(25,0)
+++  vsub HY(53,0),HY(10,0),HY(26,0)
+++  vsub HY(52,0),HY(11,0),HY(27,0)
+++  vsub HY(51,0),HY(12,0),HY(28,0)
+++  vsub HY(50,0),HY(13,0),HY(29,0)
+++  vsub HY(49,0),HY(14,0),HY(30,0)
+++  vsub HY(48,0),HY(15,0),HY(31,0)
+++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
+++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # left shift by r5 (9 or 4, chosen by the caller). NOTE(review): vasl may not saturate - confirm
+++  add r0,r8,32 # The second set of results is stored 32 bytes after the first
+++  vsth VX(48,32++),(r0+=r6) REP 16
+++  pop pc
+++
+++memclear16:
+++  # r0 is address
+++  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
+++  vmov HX(0++,0),0 REP 16
+++  mov r2,32 # Stride between stored rows in bytes (16 values of 16 bits)
+++loop:
+++  vsth HX(0++,0),(r0+=r2) REP 16
+++  add r0,16*16*2 # Advance past the 16 rows just written
+++  sub r1,16*16 # 16*16 values are cleared per iteration
+++  cmp r1,0
+++  bgt loop # The test comes after the store, so at least one iteration always runs
+++  b lr
+++
+++
+++################################################################################
+++# HEVC VPU Deblock
+++#
+++# Vertical edges before horizontal
+++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
+++#
+++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
+++# The VPU code works in units of 16x16 blocks.
+++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
+++# One final horizontal filter is required at the end.
+++# PCM is not allowed in this code.
+++#
+++#
+++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
+++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
+++
+++.set P0,63 # Named VRF rows: these symbols are used below as the row in HX(name,0) / H(name,0)
+++.set P1,62
+++.set P2,61
+++.set P3,60
+++.set Q0,59
+++.set Q1,58
+++.set Q2,57
+++.set Q3,56
+++
+++.set dp,32
+++.set dq,33
+++.set d,34
+++.set decision,35
+++.set beta,36
+++.set beta2,37
+++.set beta3,38
+++.set ptest,39
+++.set qtest,40
+++.set pqtest,41
+++.set thresh,42
+++.set deltatest, 44
+++.set deltap1, 45
+++.set tc25, 46 # NOTE(review): tc25 is set again to 49 three lines below; presumably the later value is the one used - confirm
+++.set setup,47
+++.set tc,48
+++.set tc25,49
+++.set tc2, 50
+++.set do_filter, 51
+++.set delta, 52
+++.set tc10, 53
+++.set delta0, 54
+++.set delta1, 55
+++.set zeros, 0
+++.set setup_input, 1
+++.set deltaq1, 2
+++
+++
+++
+++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
+++# Row has num16 16x16 blocks across
+++# Beta goes from 0 to 64
+++# tc goes from 0 to 24
+++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
+++#   has 8 bytes per edge
+++#   has 16 bytes per direction
+++#   has 32 bytes per 16x16 block
+++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
+++hevc_deblock_16x16:
+++  push r6-r15, lr
+++  mov r9,r4 # r9 = num16h
+++  mov r4,r3 # r4 = setup
+++  mov r13,r2 # r13 = num16w
+++  mov r2,r0 # r2 = img
+++  mov r10,r0 # r10 = img
+++  subscale4 r0,r1 # Presumably r0 = img - 4 * stride
+++  mov r8,63 # Mask used to wrap the block offset in r3
+++  mov r6,-3 # Multiplier used by do_luma_filter
+++  vmov H(zeros,0),0
+++# r7 is number of blocks still to load
+++# r0 is location of current block - 4 * stride
+++# r1 is stride
+++# r2 is location of current block
+++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+++# r4 is setup
+++# r5 is for temporary calculations
+++# r8 holds 63
+++# r6 holds -3
+++# r9 holds the number of 16 high rows to process
+++# r10 holds the original img base
+++# r11 returns 0 if no filtering was done on the edge
+++# r12 saves a copy of this
+++# r13 is copy of width
+++
+++process_row:
+++  # First iteration does not do horizontal filtering on previous
+++  mov r7, r13
+++  mov r3,0
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+++  vstb H(zeros,0),(r4) # Clear the setup entry now that it has been read
+++  bl vert_filter
+++  add r3,8 # Second vertical edge, 8 columns further along
+++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+++  bl vert_filter
+++  sub r3,8
+++  b start_deblock_loop
+++deblock_loop:
+++  # Middle iterations do vertical on current block and horizontal on preceding
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)
+++  vstb H(zeros,0),(r4) # Clear the setup entry now that it has been read
+++  bl vert_filter
+++  add r3,8 # Second vertical edge, 8 columns further along
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl vert_filter
+++  sub r3,8
+++  vldb H(setup_input,0), -16(r4) # Horizontal-edge setup of the previous block (second 16 bytes of its 32)
+++  vstb H(zeros,0),-16(r4)
+++  bl horz_filter
+++  mov r12,r11 # Remember whether the first horizontal edge was filtered
+++  add r3,8*64 # Second horizontal edge, 8 rows further down
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl horz_filter
+++  sub r3,8*64
+++  addcmpbeq r12,0,0,skip_save_top # Presumably skips the store below when r12 == 0 (first edge not filtered)
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++skip_save_top:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++start_deblock_loop:
+++  # move onto next 16x16 (could do this with circular buffer support instead)
+++  add r3,16
+++  and r3,r8
+++  add r4,32
+++  # Perform loop counter operations (may work with an addcmpbgt as well?)
+++  add r0,16
+++  add r2,16
+++  sub r7,1
+++  cmp r7,0 # Are there still more blocks to load
+++  bgt deblock_loop
+++
+++  # Final iteration needs to just do horizontal filtering
+++  vldb H(setup_input,0), -16(r4) # Horizontal-edge setup of the last block in the row
+++  vstb H(zeros,0),-16(r4)
+++  bl horz_filter
+++  mov r12,r11 # Remember whether the first horizontal edge was filtered
+++  add r3,8*64 # Second horizontal edge, 8 rows further down
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl horz_filter
+++  sub r3,64*8
+++  addcmpbeq r12,0,0,skip_save_top2 # Presumably skips the store below when r12 == 0 (first edge not filtered)
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++skip_save_top2:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++
+++# Now look to see if we should do another row
+++  sub r9,1
+++  cmp r9,0
+++  bgt start_again
+++  pop r6-r15, pc
+++start_again:
+++  # Need to sort out r0,r2 to point to next row down
+++  addscale16 r10,r1 # Presumably r10 += 16 * stride
+++  mov r2,r10
+++  subscale4 r0,r2,r1 # Presumably r0 = r2 - 4 * stride
+++  b process_row
+++
+++
+++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+++# So that we can reuse the filter code, we move the pixels to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+++
+++vert_filter:
+++  push lr
+++
+++  vmov HX(P3,0), V(16,12)+r3 # Columns 12-15 supply P3..P0
+++  vmov HX(P2,0), V(16,13)+r3
+++  vmov HX(P1,0), V(16,14)+r3
+++  vmov HX(P0,0), V(16,15)+r3
+++  vmov HX(Q0,0), V(16,16)+r3 # Columns 16-19 supply Q0..Q3
+++  vmov HX(Q1,0), V(16,17)+r3
+++  vmov HX(Q2,0), V(16,18)+r3
+++  vmov HX(Q3,0), V(16,19)+r3
+++
+++  bl do_luma_filter
+++
+++  vadds V(16,13)+r3, HX(P2,0), 0 # Adding 0 with vadds is the saturating write-back
+++  vadds V(16,14)+r3, HX(P1,0), 0
+++  vadds V(16,15)+r3, HX(P0,0), 0
+++  # P3 and Q3 never change so don't bother saving back
+++  vadds V(16,16)+r3, HX(Q0,0), 0
+++  vadds V(16,17)+r3, HX(Q1,0), 0
+++  vadds V(16,18)+r3, HX(Q2,0), 0
+++
+++  pop pc
+++
+++# Filter the horizontal edge at H(16,0)+r3: rows 12-15 supply P3..P0 and rows 16-19 supply Q0..Q3
+++horz_filter:
+++  push lr
+++
+++  vmov HX(P3,0), H(12,0)+r3
+++  vmov HX(P2,0), H(13,0)+r3
+++  vmov HX(P1,0), H(14,0)+r3
+++  vmov HX(P0,0), H(15,0)+r3
+++  vmov HX(Q0,0), H(16,0)+r3
+++  vmov HX(Q1,0), H(17,0)+r3
+++  vmov HX(Q2,0), H(18,0)+r3
+++  vmov HX(Q3,0), H(19,0)+r3
+++
+++  bl do_luma_filter
+++
+++  vadds H(13,0)+r3, HX(P2,0), 0 # Adding 0 with vadds is the saturating write-back
+++  vadds H(14,0)+r3, HX(P1,0), 0
+++  vadds H(15,0)+r3, HX(P0,0), 0
+++  # P3 and Q3 never change so don't bother saving back
+++  vadds H(16,0)+r3, HX(Q0,0), 0
+++  vadds H(17,0)+r3, HX(Q1,0), 0
+++  vadds H(18,0)+r3, HX(Q2,0), 0
+++
+++  pop pc
+++
+++# Luma edge filter on HX(P3..Q3,0). H(setup_input,0) holds the beta/tc values (callers load it from the array r4 points to); r11 is set to 1 if any filtering was done, else 0
+++do_luma_filter:
+++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
+++  valtl HX(beta,0),H(setup,0),H(setup,0)
+++  valtu HX(tc,0),H(setup,0),H(setup,0)
+++  vmul HX(tc25,0), HX(tc,0), 5
+++  vadd HX(tc25,0),HX(tc25,0), 1
+++  vasr HX(tc25,0), HX(tc25,0), 1 # tc25 = (5*tc + 1) >> 1
+++
+++  # Compute decision
+++  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
+++  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
+++  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
+++  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
+++
+++  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
+++  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
+++  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
+++  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
+++
+++  vadd HX(d,0), HX(dp,0), HX(dq,0)
+++  vasr HX(beta2,0),HX(beta,0),2 # beta2 = beta >> 2
+++  vasr HX(beta3,0),HX(beta,0),3 # beta3 = beta >> 3
+++
+++  # Compute flags that are negative if all conditions pass
+++  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
+++  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
+++  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
+++
+++  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
+++  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
+++  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
+++  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
+++  vmov HX(decision,0), 1 IFNN
+++  vadd H(decision,0),H(decision,3),0 IFN
+++  vadd H(decision,16),H(decision,19),0 IFN
+++  vmov -,HX(decision,0) SETF   # N marks strong filter
+++  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
+++
+++  vadd HX(do_filter,0), HX(d,3), HX(d,0)
+++  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
+++  vmov HX(decision,0),0 IFNN # Z marks no filter
+++
+++  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
+++  # First extract out even terms
+++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
+++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
+++  # Now expand back
+++  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
+++  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
+++
+++  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
+++
+++  # Do a quick check to see if there is anything to do
+++  mov r11, 0 # Signal no filtering
+++  vmov -,1 IFNZ SUMS r5
+++  cmp r5,0
+++  beq filtering_done
+++  mov r11, 1 # Signal some filtering
+++  # And whether there is any strong filtering
+++  vmov -,1 IFN SUMS r5
+++  cmp r5,0
+++  beq normal_filtering
+++
+++  ##############################################################################
+++  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
+++  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tc2 is tc/2, while here it is tc*2
+++
+++  # Take a copy of the original pixels for use in decision calculation
+++  vmov HX(P0,32),HX(P0,0)
+++  vmov HX(Q0,32),HX(Q0,0)
+++  vmov HX(P1,32),HX(P1,0)
+++  vmov HX(Q1,32),HX(Q1,0)
+++  vmov HX(P2,32),HX(P2,0)
+++  vmov HX(Q2,32),HX(Q2,0)
+++
+++  vadd -,HX(P2,32),4 CLRA SACC
+++  vshl -,HX(P1,32),1 SACC
+++  vshl -,HX(P0,32),1 SACC
+++  vshl -,HX(Q0,32),1 SACC
+++  vshl HX(delta,0),HX(Q1,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(P0,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
+++
+++  vadd -,HX(P2,32),2 CLRA SACC
+++  vadd -,HX(P1,32),HX(P0,32) SACC
+++  vshl HX(delta,0),HX(Q0,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 2
+++  vsub HX(delta,0),HX(delta,0),HX(P1,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
+++
+++  vadd -,HX(Q0,32),4 CLRA SACC
+++  vadd -,HX(P1,32),HX(P0,32) SACC
+++  vmul -,HX(P2,32),3 SACC
+++  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(P2,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
+++  #vmov HX(P2,0),3 IFN
+++
+++  # Now reverse all P/Qs
+++
+++  vadd -,HX(Q2,32),4 CLRA SACC
+++  vshl -,HX(Q1,32),1 SACC
+++  vshl -,HX(Q0,32),1 SACC
+++  vshl -,HX(P0,32),1 SACC
+++  vshl HX(delta,0),HX(P1,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
+++
+++  vadd -,HX(Q2,32),2 CLRA SACC
+++  vadd -,HX(Q1,32),HX(Q0,32) SACC
+++  vshl HX(delta,0),HX(P0,32),0 SACC
+++  vasr HX(delta,0),HX(delta,0), 2
+++  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
+++
+++  vadd -,HX(P0,32),4 CLRA SACC
+++  vadd -,HX(Q1,32),HX(Q0,32) SACC
+++  vmul -,HX(Q2,32),3 SACC
+++  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
+++  vasr HX(delta,0),HX(delta,0), 3
+++  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
+++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
+++  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
+++
+++  ##############################################################################
+++  # Normal filtering
+++normal_filtering:
+++  # Invert the decision flags
+++  # make instruction more complicated as assembler has error and loses SETF
+++  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
+++  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
+++
+++  vmov -,1 IFN SUMS r5
+++  cmp r5,0
+++  beq filtering_done
+++
+++  vasr HX(tc2,0), HX(tc,0), 1 # tc2 = tc >> 1 for normal filtering
+++  vmul HX(tc10,0), HX(tc,0), 10 # tc10 = 10 * tc
+++
+++  vasr HX(thresh,0), HX(beta,0), 1
+++  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
+++  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC # thresh = (beta + (beta >> 1)) >> 3
+++
+++  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
+++  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
+++  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
+++  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
+++  # Expand ptest and qtest together
+++  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
+++  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
+++  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
+++  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
+++  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
+++
+++  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
+++  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
+++  vmov -,8 CLRA SACC
+++  vmul -,HX(delta0,0), 9 SACC
+++  vmul HX(delta0,0),HX(delta1,0), r6 SACC
+++  vasr HX(delta0,0), HX(delta0,0), 4
+++  vdist HX(deltatest,0), HX(delta0,0), 0
+++  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
+++  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
+++
+++  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
+++
+++  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
+++  vadd HX(deltap1,0), HX(deltap1,0), 1
+++  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
+++  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
+++  vasr HX(deltap1,0), HX(deltap1,0), 1
+++  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
+++
+++  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
+++  vadd HX(deltaq1,0), HX(deltaq1,0), 1
+++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
+++  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
+++  vrsub -, HX(delta0,0), 0 SACC
+++  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
+++  vasr HX(deltaq1,0), HX(deltaq1,0), 1
+++  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
+++
+++  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
+++  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
+++
+++  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
+++  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
+++
+++  vmov -,HX(deltatest,0) SETF
+++  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
+++  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
+++
+++  #vmov HX(P2,0),1 IFN
+++
+++filtering_done:
+++  b lr
+++
+++
+++hevc_uv_deblock_16x16:
+++  push r6-r15, lr
+++  mov r14,0 # Leave the setup array untouched after reading it
+++  b hevc_uv_start
+++hevc_uv_deblock_16x16_with_clear:
+++  push r6-r15, lr
+++  mov r14,1 # Zero each setup entry after it has been read
+++  b hevc_uv_start
+++
+++hevc_uv_start:
+++  mov r9,r4 # r9 = number of 16 high rows
+++  mov r4,r3 # r4 = setup
+++  mov r13,r2 # r13 = width in blocks
+++  mov r2,r0 # r2 = img
+++  mov r10,r0 # r10 = img
+++  subscale4 r0,r1 # Presumably r0 = img - 4 * stride
+++  mov r8,63 # Mask used to wrap the block offset in r3
+++  mov r6,-3
+++  vmov H(zeros,0),0
+++# r7 is number of blocks still to load
+++# r0 is location of current block - 4 * stride
+++# r1 is stride
+++# r2 is location of current block
+++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
+++# r4 is setup
+++# r5 is for temporary calculations
+++# r8 holds 63
+++# r6 holds -3
+++# r9 holds the number of 16 high rows to process
+++# r10 holds the original img base
+++# r11 is not written by the chroma filter, so it must not be read here
+++# r12 is set to 1 so that the top rows of the previous block are always saved
+++# r13 is copy of width
+++# r14 is 1 if we should clear the old contents, or 0 if not
+++
+++uv_process_row:
+++  # First iteration does not do horizontal filtering on previous
+++  mov r7, r13
+++  mov r3,0
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+++  cmp r14,1
+++  bne uv_skip0
+++  vstb H(zeros,0),(r4)
+++uv_skip0:
+++  bl uv_vert_filter
+++  add r3,8 # Second vertical edge, 8 columns further along
+++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+++  bl uv_vert_filter
+++  sub r3,8
+++  b uv_start_deblock_loop
+++uv_deblock_loop:
+++  # Middle iterations do vertical on current block and horizontal on preceding
+++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+++  vldb H(16++,16)+r3,(r2 += r1) REP 16
+++  vldb H(setup_input,0), (r4)
+++  cmp r14,1
+++  bne uv_skip1
+++  vstb H(zeros,0),(r4)
+++uv_skip1:
+++  bl uv_vert_filter
+++  add r3,8 # Second vertical edge, 8 columns further along
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_vert_filter
+++  sub r3,8
+++  vldb H(setup_input,0), -16(r4) # Horizontal-edge setup of the previous block
+++  cmp r14,1
+++  bne uv_skip3
+++  vstb H(zeros,0),-16(r4)
+++uv_skip3:
+++  bl uv_horz_filter
+++  mov r12,1 # The chroma filter never sets r11, so always save the top rows (was: mov r12,r11, which read a stale register)
+++  add r3,8*64 # Second horizontal edge, 8 rows further down
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_horz_filter
+++  sub r3,8*64
+++  addcmpbeq r12,0,0,uv_skip_save_top
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++uv_skip_save_top:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++uv_start_deblock_loop:
+++  # move onto next 16x16 (could do this with circular buffer support instead)
+++  add r3,16
+++  and r3,r8
+++  add r4,32
+++  # Perform loop counter operations (may work with an addcmpbgt as well?)
+++  add r0,16
+++  add r2,16
+++  sub r7,1
+++  cmp r7,0 # Are there still more blocks to load
+++  bgt uv_deblock_loop
+++
+++  # Final iteration needs to just do horizontal filtering
+++  vldb H(setup_input,0), -16(r4) # Horizontal-edge setup of the last block in the row
+++  cmp r14,1
+++  bne uv_skip2
+++  vstb H(zeros,0),-16(r4)
+++uv_skip2:
+++  bl uv_horz_filter
+++  mov r12,1 # The chroma filter never sets r11, so always save the top rows (was: mov r12,r11, which read a stale register)
+++  add r3,8*64 # Second horizontal edge, 8 rows further down
+++  vadd H(setup_input,0),H(setup_input,8),0
+++  bl uv_horz_filter
+++  sub r3,64*8
+++  addcmpbeq r12,0,0,uv_skip_save_top2
+++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+++uv_skip_save_top2:
+++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++
+++# Now look to see if we should do another row
+++  sub r9,1
+++  cmp r9,0
+++  bgt uv_start_again
+++  pop r6-r15, pc
+++uv_start_again:
+++  # Need to sort out r0,r2 to point to next row down
+++  addscale16 r10,r1 # Presumably r10 += 16 * stride
+++  mov r2,r10
+++  subscale4 r0,r2,r1 # Presumably r0 = r2 - 4 * stride
+++  b uv_process_row
+++
+++
+++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+++# So that we can reuse the filter code, we move the pixels to be filtered into HX(P0/P1/Q0/Q1,0) - we will perform a final saturation step on placing them back into the correct locations
+++
+++uv_vert_filter:
+++  push lr
+++
+++  vmov HX(P1,0), V(16,14)+r3 # Columns 14-15 supply P1, P0
+++  vmov HX(P0,0), V(16,15)+r3
+++  vmov HX(Q0,0), V(16,16)+r3 # Columns 16-17 supply Q0, Q1
+++  vmov HX(Q1,0), V(16,17)+r3
+++
+++  bl do_chroma_filter
+++
+++  vadds V(16,15)+r3, HX(P0,0), 0 # Only P0 and Q0 are written back; do_chroma_filter leaves P1 and Q1 alone
+++  vadds V(16,16)+r3, HX(Q0,0), 0
+++
+++  pop pc
+++
+++# Filter the horizontal edge at H(16,0)+r3: rows 14-15 supply P1, P0 and rows 16-17 supply Q0, Q1
+++uv_horz_filter:
+++  push lr
+++
+++  vmov HX(P1,0), H(14,0)+r3
+++  vmov HX(P0,0), H(15,0)+r3
+++  vmov HX(Q0,0), H(16,0)+r3
+++  vmov HX(Q1,0), H(17,0)+r3
+++
+++  bl do_chroma_filter
+++
+++  vadds H(15,0)+r3, HX(P0,0), 0
+++  # P1 and Q1 are not modified by do_chroma_filter so don't bother saving back
+++  vadds H(16,0)+r3, HX(Q0,0), 0
+++
+++  pop pc
+++
+++# Chroma edge filter: adjusts HX(P0,0) and HX(Q0,0) using tc values taken from H(setup_input,0) (callers load it from the array r4 points to). Does not set r11.
+++do_chroma_filter:
+++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+++  valtl HX(tc,0),H(setup,0),H(setup,0)
+++
+++  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+++  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+++  vsub -,HX(P1,0),HX(Q1,0) SACC
+++  vmov HX(delta,0),4 SACC # Presumably the accumulated sum: ((Q0-P0)<<2) + (P1-Q1) + 4
+++  vasr HX(delta,0),HX(delta,0),3
+++  vclamps HX(delta,0), HX(delta,0), HX(tc,0) # Presumably limits delta to the range allowed by tc
+++  vadd HX(P0,0),HX(P0,0),HX(delta,0)
+++  vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+++  b lr
+++
+++# r0 = list of commands: each entry is six consecutive words that are loaded into r0-r5 (r5 is the command number tested by hevc_trans_16x16)
+++# r1 = number of commands in the list
+++hevc_run_command_list:
+++  push r6-r7, lr
+++  mov r6, r0 # r6 walks through the list
+++  mov r7, r1 # r7 counts the commands still to run
+++loop_cmds:
+++  ld r0,(r6) # How to encode r6++?
+++  add r6,4
+++  ld r1,(r6)
+++  add r6,4
+++  ld r2,(r6)
+++  add r6,4
+++  ld r3,(r6)
+++  add r6,4
+++  ld r4,(r6)
+++  add r6,4
+++  ld r5,(r6)
+++  add r6,4
+++  bl hevc_trans_16x16 # Dispatches on the command number in r5
+++  sub r7,1
+++  cmp r7,0
+++  bgt loop_cmds # NOTE(review): the first command runs before this test, even if the count in r1 was 0 - confirm callers
+++
+++  pop r6-r7, pc
++diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
++new file mode 100644
++index 0000000..3904efc
++--- /dev/null
+++++ b/libavcodec/rpi_mailbox.c
++@@ -0,0 +1,340 @@
+++/*
+++Copyright (c) 2012, Broadcom Europe Ltd.
+++All rights reserved.
+++
+++Redistribution and use in source and binary forms, with or without
+++modification, are permitted provided that the following conditions are met:
+++    * Redistributions of source code must retain the above copyright
+++      notice, this list of conditions and the following disclaimer.
+++    * Redistributions in binary form must reproduce the above copyright
+++      notice, this list of conditions and the following disclaimer in the
+++      documentation and/or other materials provided with the distribution.
+++    * Neither the name of the copyright holder nor the
+++      names of its contributors may be used to endorse or promote products
+++      derived from this software without specific prior written permission.
+++
+++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+++*/
+++
+++#include <stdio.h>
+++#include <string.h>
+++#include <stdlib.h>
+++#include <fcntl.h>
+++#include <unistd.h>
+++#include <assert.h>
+++#include <stdint.h>
+++#include <sys/mman.h>
+++#include <sys/ioctl.h>
+++
+++#include <linux/ioctl.h>
+++
+++#define MAJOR_NUM 100
+++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
+++#define DEVICE_FILE_NAME "/dev/vcio"
+++
+++#include "rpi_mailbox.h"
+++
+++#define PAGE_SIZE (4*1024)
+++
+++// Map 'size' bytes of physical memory at 'base' through /dev/mem with MAP_SHARED (shared memory will not be cached in ARM cache). Returns NULL on failure.
+++void *mapmem_shared(unsigned base, unsigned size)
+++{
+++   int mem_fd;
+++   unsigned offset = base % PAGE_SIZE;
+++   base = base - offset;   /* mmap needs a page-aligned file offset */
+++   /* open /dev/mem */
+++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+++      return NULL;
+++   }
+++   void *mem = mmap(
+++      0,
+++      size,
+++      PROT_READ|PROT_WRITE,
+++      MAP_SHARED/*|MAP_FIXED*/,
+++      mem_fd,
+++      base);
+++   close(mem_fd);   /* a mapping stays valid after its descriptor is closed; closing here also covers the failure path, which used to leak it */
+++#ifdef DEBUG
+++   printf("base=0x%x, mem=%p\n", base, mem);
+++#endif
+++   if (mem == MAP_FAILED) {
+++      printf("mmap error %d\n", (int)mem);
+++      return NULL;
+++   }
+++   return (char *)mem + offset;   /* NOTE(review): only 'size' bytes were mapped from the aligned base, so a non-zero offset shortens the usable range - confirm callers pass page-aligned addresses */
+++}
+++
+++// Unshared memory will be faster as lives in ARM cache, but requires cache flushing. Returns NULL on failure.
+++void *mapmem_private(unsigned base, unsigned size)
+++{
+++   int mem_fd;
+++   unsigned offset = base % PAGE_SIZE; /* the mapping itself starts on a PAGE_SIZE boundary */
+++   base = base - offset;
+++   /* open /dev/mem */
+++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+++      return NULL;
+++   }
+++   void *mem = mmap(
+++      0,
+++      size,
+++      PROT_READ|PROT_WRITE,
+++      MAP_PRIVATE/*|MAP_FIXED*/,
+++      mem_fd,
+++      base);
+++   close(mem_fd); /* closed on every path: a mapping does not need its descriptor to stay open */
+++#ifdef DEBUG
+++   printf("base=0x%x, mem=%p\n", base, mem);
+++#endif
+++   if (mem == MAP_FAILED) {
+++      printf("mmap error %d\n", (int)mem);
+++      return NULL;
+++   }
+++   return (char *)mem + offset; /* re-apply the sub-page offset removed from base above */
+++}
+++
+++void unmapmem(void *addr, unsigned size)
+++{
+++   const int rc = munmap(addr, size); /* munmap() yields 0 on success */
+++   if (rc == 0)
+++      return;
+++   printf("munmap error %d\n", rc);
+++   exit (-1); /* a failed unmap terminates the process */
+++}
+++
+++/*
+++ * Send the mailbox property message in buf through the IOCTL_MBOX_PROPERTY
+++ * ioctl on file_desc. Returns the ioctl() result (negative on failure).
+++ */
+++static int mbox_property(int file_desc, void *buf)
+++{
+++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+++
+++   if (ret_val < 0) {
+++      printf("ioctl_set_msg failed:%d\n", ret_val);
+++   }
+++
+++#ifdef DEBUG
+++   unsigned *p = buf; unsigned i; unsigned size = *(unsigned *)buf; /* word 0 holds the byte count */
+++   for (i=0; i<size/4; i++)
+++      printf("%04x: 0x%08x\n", (unsigned)(i*sizeof *p), p[i]); /* cast: %x expects unsigned, not size_t */
+++#endif
+++   return ret_val;
+++}
+++
+++unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+++{
+++   // Sends tag 0x3000c carrying (size, align, flags); returns response word p[5], or 0 if the ioctl failed.
+++   int i=0;
+++   unsigned p[32];
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++   p[i++] = 0x3000c; // (the tag id)
+++   p[i++] = 12; // (size of the buffer)
+++   p[i++] = 12; // (size of the data)
+++   p[i++] = size; // (num bytes? or pages?)
+++   p[i++] = align; // (alignment)
+++   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   // After a failed ioctl p[5] still holds the requested size; do not hand that back as a handle.
+++   if (mbox_property(file_desc, p) < 0) return 0;
+++   return p[5];
+++}
+++
+++unsigned mem_free(int file_desc, unsigned handle)
+++{
+++   // Property message: two header words, one tag (0x3000f) carrying the handle, then the end tag.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x3000f,    // (the tag id)
+++      4,          // (size of the buffer)
+++      4,          // (size of the data)
+++      handle,
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 7; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg);
+++   return msg[5]; // the word that carried the handle in the request
+++}
+++
+++unsigned mem_lock(int file_desc, unsigned handle)
+++{
+++   // Sends tag 0x3000d for `handle`; returns response word p[5], or 0 if the ioctl failed.
+++   int i=0;
+++   unsigned p[32];
+++   p[i++] = 0; // size
+++   p[i++] = 0x00000000; // process request
+++   p[i++] = 0x3000d; // (the tag id)
+++   p[i++] = 4; // (size of the buffer)
+++   p[i++] = 4; // (size of the data)
+++   p[i++] = handle;
+++   p[i++] = 0x00000000; // end tag
+++   p[0] = i*sizeof *p; // actual size
+++
+++   // After a failed ioctl p[5] is still the handle; callers treat a non-zero result as a valid address.
+++   if (mbox_property(file_desc, p) < 0) return 0;
+++   return p[5];
+++}
+++
+++unsigned mem_unlock(int file_desc, unsigned handle)
+++{
+++   // Property message: two header words, one tag (0x3000e) carrying the handle, then the end tag.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x3000e,    // (the tag id)
+++      4,          // (size of the buffer)
+++      4,          // (size of the data)
+++      handle,
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 7; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg);
+++   return msg[5]; // the word that carried the handle in the request
+++}
+++
+++unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+++{
+++   // Property message: two header words, tag 0x30010 with seven value words (code, r0..r5), end tag.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x30010,    // (the tag id)
+++      28,         // (size of the buffer)
+++      28,         // (size of the data)
+++      code,
+++      r0,
+++      r1,
+++      r2,
+++      r3,
+++      r4,
+++      r5,
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 13; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg);
+++   return msg[5]; // the word that carried `code` in the request
+++}
+++
+++unsigned qpu_enable(int file_desc, unsigned enable)
+++{
+++   // Property message: two header words, one tag (0x30012) carrying `enable`, then the end tag.
+++   // The value returned is whatever word 5 holds once mbox_property() has run.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x30012,    // (the tag id)
+++      4,          // (size of the buffer)
+++      4,          // (size of the data)
+++      enable,
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 7; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg);
+++   return msg[5];
+++}
+++
+++unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+++   // Property message: two header words, tag 0x30011 with four value words, then the end tag.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x30011,    // (the tag id)
+++      16,         // (size of the buffer)
+++      16,         // (size of the data)
+++      num_qpus,
+++      control,
+++      noflush,
+++      timeout,    // ms
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 10; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg);
+++   return msg[5]; // the word that carried num_qpus in the request
+++}
+++
+++void execute_multi(int file_desc,
+++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+++   // Property message: two header words, tag 0x30018 with 22 value words (88 bytes), then the end tag.
+++   unsigned msg[32] = {
+++      0,          // size, filled in below
+++      0x00000000, // process request
+++      0x30018,    // (the tag id)
+++      88,         // (size of the buffer)
+++      88,         // (size of the data)
+++
+++      num_qpus,
+++      control,
+++      noflush,
+++      timeout,    // ms
+++
+++      num_qpus_2,
+++      control_2,
+++      noflush_2,
+++      timeout_2,  // ms
+++
+++      code,
+++      r0,
+++      r1,
+++      r2,
+++      r3,
+++      r4,
+++      r5,
+++
+++      code_2,
+++      r0_2,
+++      r1_2,
+++      r2_2,
+++      r3_2,
+++      r4_2,
+++      r5_2,
+++
+++      0x00000000  // end tag
+++   };
+++   const unsigned words = 28; // number of words populated above
+++
+++   msg[0] = words * sizeof *msg; // actual size
+++   mbox_property(file_desc, msg); // nothing is read back from the reply
+++}
+++
+++int mbox_open() {
+++   // open a char device file used for communicating with kernel mbox driver
+++   const int fd = open(DEVICE_FILE_NAME, 0);
+++
+++   if (fd >= 0)
+++      return fd;
+++   // Open failed: print a hint, then hand the negative descriptor back to the caller.
+++   printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
+++   printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
+++   return fd;
+++}
+++
+++void mbox_close(int file_desc) { // Counterpart of mbox_open(): closes the mailbox descriptor
+++  close(file_desc); // the result of close() is not checked
+++}
++diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
++new file mode 100644
++index 0000000..5898102
++--- /dev/null
+++++ b/libavcodec/rpi_mailbox.h
++@@ -0,0 +1,25 @@
+++#ifndef RPI_MAILBOX_H
+++#define RPI_MAILBOX_H
+++
+++extern int mbox_open(void); // returns the descriptor from open(); negative when the device cannot be opened
+++extern void mbox_close(int file_desc);
+++
+++extern unsigned get_version(int file_desc); // NOTE(review): no definition is visible in rpi_mailbox.c - confirm
+++extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+++extern unsigned mem_free(int file_desc, unsigned handle);
+++extern unsigned mem_lock(int file_desc, unsigned handle);
+++extern unsigned mem_unlock(int file_desc, unsigned handle);
+++extern void *mapmem_shared(unsigned base, unsigned size); // MAP_SHARED mapping of /dev/mem; NULL on failure
+++extern void *mapmem_private(unsigned base, unsigned size); // MAP_PRIVATE mapping of /dev/mem; NULL on failure
+++extern void unmapmem(void *addr, unsigned size); // calls exit(-1) if munmap() fails
+++
+++extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+++extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+++extern void execute_multi(int file_desc,
+++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+++extern unsigned qpu_enable(int file_desc, unsigned enable);
+++
+++#endif
++diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
++new file mode 100644
++index 0000000..a01c051
++--- /dev/null
+++++ b/libavcodec/rpi_qpu.c
++@@ -0,0 +1,991 @@
+++#ifdef RPI
+++// Use vchiq service for submitting jobs
+++#define GPUSERVICE
+++
+++// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+++// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+++//#define RPI_TIME_TOTAL_QPU
+++// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPU code
+++//#define RPI_TIME_TOTAL_VPU
+++// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+++#define RPI_TIME_TOTAL_POSTED
+++
+++#include <stdio.h>
+++#include <stdlib.h>
+++#include <string.h>
+++#include <stddef.h>
+++#include <stdint.h>
+++#include "libavutil/avassert.h"
+++
+++#include "config.h"
+++
+++#include <pthread.h>
+++#include <time.h>
+++
+++#include "rpi_mailbox.h"
+++#include "rpi_qpu.h"
+++#include "rpi_shader.h"
+++#include "rpi_hevc_transform.h"
+++
+++#include "rpi_user_vcsm.h"
+++#ifdef GPUSERVICE
+++#pragma GCC diagnostic push
+++// Many many redundant decls in the header files
+++#pragma GCC diagnostic ignored "-Wredundant-decls"
+++#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+++#pragma GCC diagnostic pop
+++#endif
+++
+++// QPU profile flags
+++#define NO_FLUSH 1
+++#define CLEAR_PROFILE 2
+++#define OUTPUT_COUNTS 4
+++
+++#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++
+++
+++// On Pi2 there is no way to access the VPU L2 cache
+++// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+++// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
+++// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
+++#define GPU_MEM_FLG 0x4
+++// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
+++#define GPU_MEM_MAP 0x0
+++
+++#define vcos_verify_ge0(x) ((x)>=0)
+++
+++/*static const unsigned code[] =
+++{
+++  #include "rpi_shader.hex"
+++};*/
+++
+++// Size in 32bit words
+++#define QPU_CODE_SIZE 2048
+++#define VPU_CODE_SIZE 2048
+++
+++const short rpi_transMatrix2even[32][16] = { // Even rows first (16 rows); gpu_init() copies this table into struct GPU
+++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
+++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
+++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
+++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
+++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
+++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
+++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
+++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
+++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
+++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
+++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
+++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
+++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
+++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
+++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
+++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
+++// Odd rows (the remaining 16 rows)
+++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
+++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
+++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
+++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
+++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
+++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
+++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
+++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
+++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
+++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
+++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
+++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
+++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
+++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
+++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
+++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+++};
+++
+++struct GPU // Block allocated and zero-filled by gpu_init(); code and table members are filled there
+++{
+++  unsigned int qpu_code[QPU_CODE_SIZE]; // QPU code copied from rpi_shader
+++  unsigned int vpu_code[VPU_CODE_SIZE]; // VPU code copied from rpi_hevc_transform
+++  short transMatrix2even[16*16*2]; // Copy of rpi_transMatrix2even
+++  int open_count; // Number of allocated video buffers (incremented by gpu_malloc_*, decremented by gpu_free)
+++  int      mb; // Mailbox handle
+++  int      vc; // Address in GPU memory
+++  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+++  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+++};
+++
+++// Stop more than one thread trying to allocate memory or use the processing resources at once
+++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; // taken by gpu_lock(), released by gpu_unlock()
+++static volatile struct GPU* gpu = NULL; // NULL until gpu_init() succeeds; reset to NULL by gpu_term()
+++static GPU_MEM_PTR_T gpu_mem_ptr; // allocation backing *gpu; released in gpu_term()
+++
+++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+++static unsigned int Microseconds(void) { // microseconds since the first call, modulo 2^32
+++    struct timespec ts;
+++    unsigned int x;
+++    static unsigned int base = 0; // reading taken on the first call (0 means "not taken yet")
+++    clock_gettime(CLOCK_REALTIME, &ts);
+++    x = (unsigned int)ts.tv_sec*1000000u + (unsigned int)(ts.tv_nsec/1000); // unsigned: wraps instead of overflowing a signed type
+++    if (base==0) base=x;
+++    return x-base;
+++}
+++#endif
+++
+++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+++static void gpu_free_internal(GPU_MEM_PTR_T *p);
+++
+++// Connect to QPU, returns 0 on success (negative on failure). Opens the mailbox, allocates struct GPU and fills it.
+++static int gpu_init(volatile struct GPU **gpu) {
+++  int mb = mbox_open();
+++  int vc;
+++  volatile struct GPU* ptr;
+++  if (mb < 0)
+++    return -1;
+++#ifndef RPI_ASYNC
+++  if (qpu_enable(mb, 1)) { mbox_close(mb); return -2; } // do not leak the mailbox descriptor on failure
+++#endif
+++  vcsm_init();
+++  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+++  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+++  memset((void*)ptr, 0, sizeof *ptr);
+++  vc = gpu_mem_ptr.vc;
+++
+++  ptr->mb = mb;
+++  ptr->vc = vc;
+++
+++  printf("GPU allocated at 0x%x\n",vc);
+++
+++  *gpu = ptr; // published before the qpu_get_fn() calls below, which read gpu->vc
+++
+++  // Now copy over the QPU code into GPU memory
+++  {
+++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+++  }
+++  // And the VPU code
+++  {
+++    int num_bytes = sizeof(rpi_hevc_transform);
+++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+++  }
+++  // And the transform coefficients
+++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++
+++#ifdef RPI_ASYNC
+++  {
+++    int err;
+++    vpu_async_tail = 0;
+++    vpu_async_head = 0;
+++    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+++    //printf("Created thread\n");
+++    if (err) {
+++        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+++        return -4;
+++    }
+++
+++    {
+++      struct sched_param param = {0};
+++      int policy = 0;
+++
+++      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+++      {
+++        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+++      }
+++      else
+++      {
+++        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+++            policy,
+++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++            param.sched_priority);
+++
+++        policy = SCHED_FIFO;
+++        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+++
+++        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+++            policy,
+++            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++            param.sched_priority);
+++
+++        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+++        {
+++          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+++        }
+++        else
+++        {
+++          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+++          {
+++            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+++          }
+++          else
+++          {
+++            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+++                policy,
+++                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+++                param.sched_priority);
+++          }
+++        }
+++      }
+++
+++    }
+++
+++  }
+++#endif
+++
+++  return 0;
+++}
+++
+++// Probe gpu_mutex without blocking: 1 when it could be taken (it is released again at once), else 0.
+++static int gpu_idle(void)
+++{
+++  // A non-zero trylock result means the mutex was not acquired.
+++  if (pthread_mutex_trylock(&gpu_mutex) != 0)
+++    return 0;
+++  // Acquired: this was only a probe, so give the mutex straight back.
+++  pthread_mutex_unlock(&gpu_mutex);
+++  return 1;
+++}
+++
+++// Acquire gpu_mutex, then bring the GPU up on first use while the mutex is held.
+++static void gpu_lock(void)
+++{
+++  pthread_mutex_lock(&gpu_mutex);
+++  if (!gpu)
+++    gpu_init(&gpu); // the status returned by gpu_init() is not examined
+++  // The mutex is still held on return; gpu_unlock() releases it.
+++}
+++
+++static void gpu_unlock(void) { // Releases the mutex taken by gpu_lock()
+++  pthread_mutex_unlock(&gpu_mutex);
+++}
+++
+++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) { // fills *p; always returns 0 (failures abort via av_assert0)
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); // handle later passed to mem_lock()/mem_unlock()
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle); // pointer used on the ARM side
+++  av_assert0(p->arm);
+++  p->vc = mem_lock(mb, p->vc_handle); // mb is passed in because gpu_init() calls this before the global gpu is set
+++  av_assert0(p->vc);
+++  return 0;
+++}
+++
+++// Allocate memory on GPU
+++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+++// Returns 0 on success.
+++// This allocates memory that will not be cached in ARM's data cache.
+++// Therefore safe to use without data cache flushing.
+++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_lock(); // also initialises the GPU on first use
+++  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+++  gpu->open_count++; // balanced by the decrement in gpu_free()
+++  gpu_unlock();
+++  return r;
+++}
+++
+++int gpu_get_mailbox(void) // Returns the mailbox descriptor stored in the GPU block
+++{
+++  av_assert0(gpu); // unlike vpu_get_fn()/qpu_get_fn(), this does not initialise the GPU itself
+++  return gpu->mb;
+++}
+++
+++// Call this to clean and invalidate a region of memory (the whole buffer described by p)
+++void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+++{
+++#ifdef RPI_FAST_CACHEFLUSH
+++    struct vcsm_user_clean_invalid_s iocache = {}; // only entry 0 is filled in; the rest stays zero
+++    iocache.s[0].handle = p->vcsm_handle;
+++    iocache.s[0].cmd = 3; // clean+invalidate
+++    iocache.s[0].addr = (int) p->arm;
+++    iocache.s[0].size  = p->numbytes;
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    void *tmp = vcsm_lock(p->vcsm_handle); // fallback: a vcsm_lock()/vcsm_unlock_ptr() pair on the buffer
+++    vcsm_unlock_ptr(tmp);
+++#endif
+++}
+++
+++void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+++{
+++    // Three-buffer variant: the buffers are handled in the order p0, p1, p2.
+++    GPU_MEM_PTR_T * const bufs[3] = { p0, p1, p2 };
+++    int k;
+++#ifdef RPI_FAST_CACHEFLUSH
+++    // All three regions travel in a single vcsm_clean_invalid() request,
+++    // one entry of iocache.s[] per buffer.
+++    struct vcsm_user_clean_invalid_s iocache = {};
+++
+++    for (k = 0; k < 3; k++) {
+++        iocache.s[k].handle = bufs[k]->vcsm_handle;
+++        iocache.s[k].cmd = 3; // clean+invalidate
+++        iocache.s[k].addr = (int) bufs[k]->arm;
+++        iocache.s[k].size  = bufs[k]->numbytes;
+++    }
+++
+++    vcsm_clean_invalid( &iocache );
+++#else
+++    // Fallback: one vcsm_lock()/vcsm_unlock_ptr() pair per buffer.
+++
+++    for (k = 0; k < 3; k++) {
+++        void *tmp = vcsm_lock(bufs[k]->vcsm_handle);
+++        vcsm_unlock_ptr(tmp);
+++    }
+++#endif
+++}
+++
+++static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) { // fills *p; always returns 0 (failures abort via av_assert0)
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); // handle later passed to mem_lock()/mem_unlock()
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle); // pointer used on the ARM side
+++  av_assert0(p->arm);
+++  p->vc = mem_lock(gpu->mb, p->vc_handle); // reads the global gpu, so the GPU must already be initialised
+++  av_assert0(p->vc);
+++  return 0;
+++}
+++
+++// This allocates data that will be
+++//    Cached in ARM L2
+++//    Uncached in VPU L2
+++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_lock(); // also initialises the GPU on first use
+++  r = gpu_malloc_cached_internal(numbytes, p);
+++  gpu->open_count++; // balanced by the decrement in gpu_free()
+++  gpu_unlock();
+++  return r;
+++}
+++
+++static void gpu_term(void) // Undo gpu_init(); does nothing when the GPU is not initialised
+++{
+++  int mb;
+++
+++  if (gpu==NULL)
+++    return;
+++  mb = gpu->mb; // saved first: the block holding it is freed below
+++
+++  // ??? Tear down anything needed for gpuexecute
+++
+++  qpu_enable(mb, 0);
+++  gpu_free_internal(&gpu_mem_ptr); // must run while gpu is still set, since it reads gpu->mb
+++
+++  vcsm_exit();
+++
+++  mbox_close(mb);
+++  gpu = NULL;
+++}
+++
+++void gpu_free_internal(GPU_MEM_PTR_T *p) { // Releases one allocation; the forward declaration earlier in the file is static
+++  int mb = gpu->mb; // requires gpu to be non-NULL
+++  mem_unlock(mb,p->vc_handle);
+++  vcsm_unlock_ptr(p->arm);
+++  vcsm_free(p->vcsm_handle);
+++}
+++
+++void gpu_free(GPU_MEM_PTR_T *p) { // Frees a buffer from gpu_malloc_*(); shuts the GPU down when the last one goes
+++  gpu_lock();
+++
+++  gpu_free_internal(p);
+++
+++  gpu->open_count--;
+++  if (gpu->open_count==0) {
+++      printf("Closing GPU\n");
+++      gpu_term();
+++      gpu = NULL; // gpu_term() has already cleared it; kept as in the original
+++  }
+++  gpu_unlock();
+++}
+++
+++unsigned int vpu_get_fn(void) {
+++  const unsigned int code_offset = offsetof(struct GPU,vpu_code);
+++  if (!gpu) {
+++    // gpu_lock() initialises the GPU on first use; the mutex is released straight away.
+++    printf("Preparing gpu\n");
+++    gpu_lock();
+++    gpu_unlock();
+++  }
+++  return gpu->vc + code_offset; // GPU-side address of the vpu_code member
+++}
+++
+++unsigned int vpu_get_constants(void) {
+++  const unsigned int table_offset = offsetof(struct GPU,transMatrix2even);
+++  if (!gpu) { // first use: gpu_lock() initialises the GPU
+++    gpu_lock();
+++    gpu_unlock();
+++  }
+++  return gpu->vc + table_offset; // GPU-side address of the transMatrix2even member
+++}
+++
+++#ifdef GPUSERVICE
+++static void callback(void *cookie) // Job-completion callback: cookie is the sem_t the submitter waits on
+++{
+++  sem_post((sem_t *)cookie);
+++}
+++#endif
+++
+++
+++static volatile uint32_t post_done = 0; // highest job id reported complete by post_code2_cb()
+++static volatile uint32_t post_qed = 0; // last job id handed out by vpu_post_code2()
+++
+++static void post_code2_cb(void * v) // v carries the job id stored in the cookie by vpu_post_code2()
+++{
+++  uint32_t n = (uint32_t)v;
+++  if ((int32_t)(n - post_done) > 0) { // signed difference, so the comparison survives id wrap-around
+++    post_done = n;
+++  }
+++}
+++
+++
+++// Post a command to the queue
+++// Returns an id which we can use to wait for completion (see vpu_wait); the buf parameter is not used here
+++int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+++{
+++  struct gpu_job_s j[1] = {
+++    {
+++      .command = EXECUTE_VPU,
+++      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+++      .callback.func = post_code2_cb
+++    }
+++  };
+++  uint32_t id;
+++
+++  j[0].callback.cookie = (void *)(id = ++post_qed); // the id itself travels as the cookie
+++
+++  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++
+++  return id;
+++}
+++// Submit one VPU job and two QPU jobs, then block until the final job's callback posts a semaphore. Returns 0.
+++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++    int qpu0_n, const uint32_t * qpu0_mail,
+++    int qpu1_n, const uint32_t * qpu1_mail)
+++{
+++#if 1
+++  sem_t sync0;
+++  struct gpu_job_s j[4];
+++
+++  sem_init(&sync0, 0, 0);
+++
+++  j[0].command = EXECUTE_VPU;
+++  j[0].u.v.q[0] = vpu_code;
+++  j[0].u.v.q[1] = r0;
+++  j[0].u.v.q[2] = r1;
+++  j[0].u.v.q[3] = r2;
+++  j[0].u.v.q[4] = r3;
+++  j[0].u.v.q[5] = r4;
+++  j[0].u.v.q[6] = r5;
+++  j[0].callback.func = 0;
+++  j[0].callback.cookie = NULL;
+++
+++  j[1].command = EXECUTE_QPU; // note: the qpu1 mail is queued before the qpu0 mail
+++  j[1].u.q.jobs = qpu1_n;
+++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+++  j[1].u.q.timeout = 5000;
+++  j[1].callback.func = 0;
+++  j[1].callback.cookie = NULL;
+++
+++  j[2].command = EXECUTE_QPU;
+++  j[2].u.q.jobs = qpu0_n;
+++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[2].u.q.noflush = 1;
+++  j[2].u.q.timeout = 5000;
+++  j[2].callback.func = 0;
+++  j[2].callback.cookie = NULL;
+++
+++  j[3].command = EXECUTE_SYNC; // only this job has a callback; it posts sync0
+++  j[3].u.s.mask = 3;
+++  j[3].callback.func = callback;
+++  j[3].callback.cookie = (void *)&sync0;
+++
+++  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+++  sem_wait(&sync0);
+++  sem_destroy(&sync0); // pair every sem_init() with a sem_destroy()
+++#else
+++  sem_t sync0, sync2;
+++  struct gpu_job_s j[3];
+++
+++  sem_init(&sync0, 0, 0);
+++  sem_init(&sync2, 0, 0);
+++
+++  j[0].command = EXECUTE_VPU;
+++  j[0].u.v.q[0] = vpu_code;
+++  j[0].u.v.q[1] = r0;
+++  j[0].u.v.q[2] = r1;
+++  j[0].u.v.q[3] = r2;
+++  j[0].u.v.q[4] = r3;
+++  j[0].u.v.q[5] = r4;
+++  j[0].u.v.q[6] = r5;
+++  j[0].callback.func = callback;
+++  j[0].callback.cookie = (void *)&sync0;
+++
+++  j[1].command = EXECUTE_QPU;
+++  j[1].u.q.jobs = qpu1_n;
+++  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+++  j[1].u.q.timeout = 5000;
+++  j[1].callback.func = 0;
+++  j[1].callback.cookie = NULL;
+++
+++  j[2].command = EXECUTE_QPU;
+++  j[2].u.q.jobs = qpu0_n;
+++  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  j[2].u.q.noflush = 1;
+++  j[2].u.q.timeout = 5000;
+++  j[2].callback.func = callback;
+++  j[2].callback.cookie = (void *)&sync2;
+++
+++  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+++  sem_wait(&sync0);
+++  sem_wait(&sync2);
+++  sem_destroy(&sync0);
+++  sem_destroy(&sync2);
+++#endif
+++
+++  return 0;
+++}
+++
+++
+++// Wait for completion of the given command (id as returned by vpu_post_code2); id == 0 returns at once
+++void vpu_wait(int id)
+++{
+++  if (id == 0) {
+++#if 0
+++    sem_t sync0;
+++    struct gpu_job_s j[1] =
+++    {
+++      {
+++        .command = EXECUTE_SYNC,
+++        .u.s.mask = 3,
+++        .callback.func = callback,
+++        .callback.cookie = (void *)&sync0
+++      }
+++    };
+++
+++    sem_init(&sync0, 0, 0);
+++
+++    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++
+++    sem_wait(&sync0);
+++#endif
+++  }
+++  else {
+++    while ((int32_t)(post_done - (uint32_t)id) < 0) { // poll until post_code2_cb() has recorded an id >= this one
+++      usleep(1000);
+++    }
+++  }
+++}
+++
+++
+++unsigned int qpu_get_fn(int num) { // Maps a QPU_MC_* id to the GPU address of that entry point; exits on an unknown id
+++    // Make sure that the gpu is initialized
+++    unsigned int *fn;
+++    if (gpu==NULL) {
+++      printf("Preparing gpu\n");
+++      gpu_lock();
+++      gpu_unlock();
+++    }
+++    switch(num) {
+++    case QPU_MC_SETUP:
+++      fn = mc_setup;
+++      break;
+++    case QPU_MC_FILTER:
+++      fn = mc_filter;
+++      break;
+++    case QPU_MC_EXIT:
+++      fn = mc_exit;
+++      break;
+++    case QPU_MC_INTERRUPT_EXIT12:
+++      fn = mc_interrupt_exit12;
+++      break;
+++    case QPU_MC_FILTER_B:
+++      fn = mc_filter_b;
+++      break;
+++    //case QPU_MC_FILTER_HONLY:
+++    //  fn = mc_filter_honly;
+++    //  break;
+++    case QPU_MC_SETUP_UV:
+++      fn = mc_setup_uv;
+++      break;
+++    case QPU_MC_FILTER_UV:
+++      fn = mc_filter_uv;
+++      break;
+++    case QPU_MC_FILTER_UV_B0:
+++      fn = mc_filter_uv_b0;
+++      break;
+++    case QPU_MC_FILTER_UV_B:
+++      fn = mc_filter_uv_b;
+++      break;
+++    case QPU_MC_INTERRUPT_EXIT8:
+++      fn = mc_interrupt_exit8;
+++      break;
+++    case QPU_MC_END:
+++      fn = mc_end;
+++      break;
+++    default:
+++      printf("Unknown function\n");
+++      exit(-1);
+++    }
+++    return gpu->vc + 4*(int)(fn-rpi_shader); // word offset within rpi_shader, scaled to bytes, added to gpu->vc
+++    //return code[num] + gpu->vc;
+++}
+++
+++#if 0
+++typedef unsigned int uint32_t;
+++
+++typedef struct mvs_s { // Cut-down context for the disabled (#if 0) QPU test code in this section
+++    GPU_MEM_PTR_T unif_mvs_ptr;
+++    uint32_t *unif_mvs; // Base of memory for motion vector commands
+++
+++    // _base pointers are to the start of the row
+++    uint32_t *mvs_base[8];
+++    // these pointers are to the next free space
+++    uint32_t *u_mvs[8];
+++
+++} HEVCContext;
+++
+++#define RPI_CHROMA_COMMAND_WORDS 12
+++
+++static void rpi_inter_clear(HEVCContext *s) // Reset each of the 8 streams to its base and write one initial 12-word entry
+++{
+++    int i;
+++    for(i=0;i<8;i++) {
+++        s->u_mvs[i] = s->mvs_base[i];
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 0;
+++        *s->u_mvs[i]++ = 128;  // w
+++        *s->u_mvs[i]++ = 128;  // h
+++        *s->u_mvs[i]++ = 128;  // stride u
+++        *s->u_mvs[i]++ = 128;  // stride v
+++        s->u_mvs[i] += 3;  // Padding words (9 written + 3 skipped = 12 words per entry)
+++    }
+++}
+++
+++static void rpi_execute_inter_qpu(HEVCContext *s) // Patch the last entry of each stream, then launch via qpu_run_shader8
+++{
+++    int k;
+++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+++
+++    for(k=0;k<8;k++) {
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+++    }
+++
+++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+++
+++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), // each argument below rebases an ARM-side stream pointer onto unif_vc
+++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+++      );
+++}
+++
+++void rpi_test_qpu(void) // Disabled smoke test: allocate 8 streams, launch the QPUs four times, free the memory
+++{
+++    HEVCContext mvs;
+++    HEVCContext *s = &mvs;
+++    int i;
+++    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS; // words reserved per stream
+++    uint32_t *p;
+++    printf("Allocate memory\n");
+++    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+++    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+++
+++    // Set up initial locations for uniform streams
+++    p = s->unif_mvs;
+++    for(i = 0; i < 8; i++) {
+++        s->mvs_base[i] = p;
+++        p += uv_commands_per_qpu;
+++    }
+++    // Now run a simple program that should just quit immediately after a single texture fetch
+++    rpi_inter_clear(s);
+++    for(i=0;i<4;i++) {
+++      printf("Launch QPUs\n");
+++      rpi_execute_inter_qpu(s);
+++      printf("Done\n");
+++    }
+++    printf("Free memory\n");
+++    gpu_free(&s->unif_mvs_ptr);
+++    return;
+++}
+++#endif
+++
+++#if 0
+++
+++int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+++//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+++int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+++//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+++
+++#define ENCODE_COEFFS(c0, c1, c2, c3) ((((c0)-1) & 0xff) | (((c1)-1) & 0xff) << 8 | (((c2)-1) & 0xff) << 16 | (uint32_t)(((c3)-1) & 0xff) << 24) // packs (c-1) of each tap into one byte, c0 in the low byte; no trailing ';' so it is usable inside expressions
+++// Clip a to [0,255]. Uses ~a rather than -a so that INT32_MIN cannot overflow on negation (it must clip to 0).
+++static uint8_t av_clip_uint8(int32_t a)
+++{
+++    if (a&(~255)) return (~a)>>31;
+++    else          return a;
+++}
+++// Reference 8x8 separable filter: hcoeffs across each row, then vcoeffs down the clipped row results.
+++static int32_t filter8(const uint8_t *data, int pitch)
+++{
+++   int32_t acc_v = 0;
+++   int row;
+++
+++   for (row = 0; row < 8; row++) {
+++      const uint8_t *line = data + row * pitch;
+++      int32_t acc_h = 0;
+++      int col;
+++      for (col = 0; col < 8; col++)
+++         acc_h += hcoeffs[col] * line[col];
+++      // Round (+64, >>7) and clip the horizontal sum before weighting it vertically
+++      acc_v += vcoeffs[row] * av_clip_uint8((acc_h + 64) >> 7);
+++   }
+++   return av_clip_uint8((acc_v + 64) >> 7);
+++}
+++
+++// Note regression changes coefficients so is not thread safe
+++//#define REGRESSION
+++#ifdef REGRESSION
+++#define CMAX 100
+++#else
+++#define CMAX 2
+++#endif
+++#define YMAX 16
+++// Self-test: filters random input through rpi_do_block() CMAX times and compares each output pixel with the filter8() reference.
+++int rpi_test_shader(void)
+++{
+++   int i, c;
+++
+++   uint32_t *unifs;
+++
+++   uint8_t *in_buffer;
+++   uint8_t *out_buffer[2];
+++
+++   GPU_MEM_PTR_T unifs_ptr;
+++   GPU_MEM_PTR_T in_buffer_ptr;
+++   GPU_MEM_PTR_T out_buffer_ptr[2];
+++
+++   // Addresses in GPU memory of filter programs
+++   uint32_t mc_setup = 0;
+++   uint32_t mc_filter = 0;
+++   uint32_t mc_exit = 0;
+++
+++   int pitch = 0x500; // destination row pitch in bytes
+++
+++   if (gpu==NULL) { // NOTE(review): presumably gpu_lock() sets up the global gpu on first use - confirm
+++      gpu_lock();
+++      gpu_unlock();
+++   }
+++
+++   printf("This needs to change to reflect new assembler\n");
+++   // Use table to compute locations of program start points
+++   mc_setup = code[0] + gpu->vc;
+++   mc_filter = code[1] + gpu->vc;
+++   mc_exit = code[2] + gpu->vc;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+++      return -2;
+++   }
+++   unifs = (uint32_t*)unifs_ptr.arm;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+++      return -3; // NOTE(review): unifs_ptr is not freed on this path
+++   }
+++   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+++      return -4; // NOTE(review): earlier allocations are not freed on this path
+++   }
+++   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+++   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+++
+++   for (c = 0; c < CMAX; c++) {
+++      int xo[] = {rand()&31, rand()&31}; // random horizontal source offset (0..31) for each of the two blocks
+++
+++#ifdef REGRESSION
+++      for (i = 0; i < 8; i++) {
+++         hcoeffs[i] = (int8_t)rand();
+++         vcoeffs[i] = (int8_t)rand();
+++         if (hcoeffs[i]==-128)
+++           hcoeffs[i]++;
+++         if (vcoeffs[i]==-128)
+++           vcoeffs[i]++;
+++      }
+++#endif
+++
+++      for (i = 0; i < 64*23; i++) {
+++         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+++         in_buffer[i] = rand();
+++      }
+++
+++      // Clear output array
+++      {
+++        int b;
+++        for(b=0;b<2;b++) {
+++          for(i=0;i<16*16;i++) { // NOTE(review): fills only the first 256 bytes, while output rows are pitch bytes apart
+++            out_buffer[b][i] = 3;
+++          }
+++        }
+++      }
+++      // NOTE(review): this uniform stream is not consumed here; the qpu_run_shader calls below are commented out and rpi_do_block builds its own
+++      unifs[0] = mc_filter;
+++      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+++      unifs[2] = 64; // src pitch
+++      unifs[3] = pitch; // dst pitch
+++      unifs[4] = 0; // Padding
+++      unifs[5] = 0;
+++      unifs[6] = 0;
+++      unifs[7 ] = mc_filter;
+++      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+++      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++      unifs[13] = out_buffer_ptr[0].vc;
+++      unifs[14] = mc_exit;
+++      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+++      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++      unifs[20] = out_buffer_ptr[1].vc;
+++
+++      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+++
+++      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+++
+++      //qpu_run_shader(mc_setup, unifs_ptr.vc);
+++      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+++      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]); // NOTE(review): integer GPU addresses are passed where rpi_do_block declares pointer parameters
+++      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+++
+++      if (1)
+++      {
+++         int x, y, b;
+++         int bad = 0; // set when any output pixel of either block differs from the reference
+++
+++         for (b=0; b<2; ++b)
+++            for (y=0; y<YMAX; ++y)
+++               for (x=0; x<16; ++x) {
+++                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+++
+++                  if (out_buffer[b][x+y*pitch] != ref) {
+++                      bad = 1;
+++//                     printf("%d, %d, %d, %d\n", c, b, x, y);
+++                  }
+++#ifndef REGRESSION
+++                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+++#endif
+++               }
+++          if (bad)
+++            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c); // NOTE(review): always reports buffer 1's address, whichever block failed
+++          else
+++            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+++      }
+++      //printf("%d\n", simpenrose_get_qpu_tick_count());
+++   }
+++
+++   gpu_free(&out_buffer_ptr[0]);
+++   gpu_free(&out_buffer_ptr[1]);
+++   gpu_free(&in_buffer_ptr);
+++   gpu_free(&unifs_ptr);
+++
+++   return 0;
+++}
+++// ARM reference implementation: writes a 16x16 block where each output pixel is filter8() of the source at the same x,y.
+++void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++{
+++  int row, col;
+++  for (row = 0; row < 16; ++row) {
+++    const uint8_t *src_row = in_buffer + row * src_pitch;
+++    for (col = 0; col < 16; ++col)
+++      dst[col + row * dst_pitch] = filter8(src_row + col, src_pitch);
+++  }
+++}
+++// Run the filter shader for one block: builds a temporary uniform stream and passes it to qpu_run_shader().
+++void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst) // NOTE(review): dst is unused while the copy-back loop below is commented out
+++{
+++   uint32_t *unifs;
+++
+++   GPU_MEM_PTR_T unifs_ptr;
+++   //uint8_t *out_buffer;
+++   //GPU_MEM_PTR_T out_buffer_ptr;
+++
+++   // Addresses in GPU memory of filter programs
+++   uint32_t mc_setup = 0;
+++   uint32_t mc_filter = 0;
+++   uint32_t mc_exit = 0;
+++   //int x,y;
+++
+++   if (gpu==NULL) { // NOTE(review): presumably gpu_lock() sets up the global gpu on first use - confirm
+++      gpu_lock();
+++      gpu_unlock();
+++   }
+++
+++   // Use table to compute locations of program start points
+++   mc_setup = code[0] + gpu->vc;
+++   mc_filter = code[1] + gpu->vc;
+++   mc_exit = code[2] + gpu->vc;
+++
+++   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+++      return;
+++   }
+++   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+++   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+++
+++   /*for (y=0; y<16; ++y) {
+++      for (x=0; x<16; ++x) {
+++         out_buffer[x+y*dst_pitch] = 7;
+++      }
+++    }*/
+++
+++   unifs = (uint32_t*)unifs_ptr.arm;
+++
+++    unifs[0] = mc_filter;
+++    unifs[1] = (int)in_buffer_vc;
+++    unifs[2] = src_pitch; // src pitch
+++    unifs[3] = dst_pitch; // dst pitch
+++    unifs[4] = 0; // Padding
+++    unifs[5] = 0;
+++    unifs[6] = 0;
+++    unifs[7 ] = mc_exit;
+++    unifs[8 ] = (int)in_buffer_vc;
+++    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+++    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+++    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+++    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+++    unifs[13] = (int)dst_vc;
+++    //unifs[13] = (int)out_buffer_ptr.vc;
+++
+++    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+++
+++    qpu_run_shader(mc_setup, unifs_ptr.vc); // entry point is mc_setup; uniforms are passed by their GPU-side address
+++
+++    /*for (y=0; y<16; ++y) {
+++      for (x=0; x<16; ++x) {
+++         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+++      }
+++    }*/
+++
+++    gpu_free(&unifs_ptr);
+++    //gpu_free(&out_buffer_ptr);
+++}
+++
+++
+++
+++#endif
+++
+++#endif // RPI
++diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
++new file mode 100644
++index 0000000..c6cdb2b
++--- /dev/null
+++++ b/libavcodec/rpi_qpu.h
++@@ -0,0 +1,176 @@
+++#ifndef RPI_QPU_H
+++#define RPI_QPU_H
+++
+++// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+++// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+++#define RPI_FAST_CACHEFLUSH
+++
+++#define RPI_ONE_BUF 1
+++// Descriptor of one GPU memory block, carrying both its ARM-side mapping and its GPU-side address.
+++typedef struct gpu_mem_ptr_s {
+++  unsigned char *arm; // Pointer to memory mapped on ARM side
+++  int vc_handle;   // Videocore handle of relocatable memory
+++  int vcsm_handle; // Handle for use by VCSM
+++  int vc;       // Address for use in GPU code
+++  int numbytes; // Size of memory block
+++} GPU_MEM_PTR_T;
+++
+++// General GPU functions
+++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+++extern void gpu_free(GPU_MEM_PTR_T *p);
+++extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+++extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+++
+++#include "libavutil/frame.h"
+++#if !RPI_ONE_BUF
+++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { // GPU address from the pool opaque data of frame->buf[0]
+++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+++    return p->vc;
+++}
+++// GPU address taken from the pool opaque data of frame->buf[1].
+++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+++    return p->vc;
+++}
+++// GPU address taken from the pool opaque data of frame->buf[2].
+++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+++    return p->vc;
+++}
+++// Returns a copy of the GPU memory descriptor stored as pool opaque data of frame->buf[0].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+++    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+++}
+++// Returns a copy of the GPU memory descriptor stored as pool opaque data of frame->buf[1].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+++    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+++}
+++// Returns a copy of the GPU memory descriptor stored as pool opaque data of frame->buf[2].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+++    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+++}
+++
+++#else
+++// Non-zero when the frame has no second buffer reference (buf[1] == NULL); callers then address all planes through buf[0].
+++static inline int gpu_is_buf1(const AVFrame * const frame)
+++{
+++    return frame->buf[1] == NULL;
+++}
+++// GPU memory descriptor attached as the opaque pointer of frame->buf[0] (single-buffer case).
+++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+++{
+++    return av_buffer_get_opaque(frame->buf[0]);
+++}
+++// GPU memory descriptor taken from the pool opaque data of frame->buf[n] (one buffer per plane).
+++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+++{
+++    return av_buffer_pool_opaque(frame->buf[n]);
+++}
+++
+++// GPU address of the Y plane: the vc field of whichever descriptor backs buf[0] in the current buffer layout.
+++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+++    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+++}
+++// GPU address of the U plane. Single-buffer frames: vc base plus the byte offset of data[1] from data[0] (difference taken first so no out-of-range pointer is formed).
+++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+++    return gpu_is_buf1(frame) ?
+++        gpu_buf1_gmem(frame)->vc + (frame->data[1] - frame->data[0]) :
+++        gpu_buf3_gmem(frame, 1)->vc;
+++}
+++// GPU address of the V plane. Single-buffer frames: vc base plus the byte offset of data[2] from data[0] (difference taken first so no out-of-range pointer is formed).
+++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+++    return gpu_is_buf1(frame) ?
+++        gpu_buf1_gmem(frame)->vc + (frame->data[2] - frame->data[0]) :
+++        gpu_buf3_gmem(frame, 2)->vc;
+++}
+++
+++// Descriptor copy for the Y plane; for single-buffer frames numbytes is narrowed to the distance from data[0] to data[1].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+++    if (gpu_is_buf1(frame))
+++    {
+++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+++        g.numbytes = frame->data[1] - frame->data[0];
+++        return g;
+++    }
+++    else
+++        return *gpu_buf3_gmem(frame, 0);
+++}
+++// Descriptor copy for the U plane; for single-buffer frames arm and vc are advanced by the offset of data[1] from data[0].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+++    if (gpu_is_buf1(frame))
+++    {
+++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+++        g.arm += frame->data[1] - frame->data[0];
+++        g.vc += frame->data[1] - frame->data[0];
+++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
+++        return g;
+++    }
+++    else
+++        return *gpu_buf3_gmem(frame, 1);
+++}
+++// Descriptor copy for the V plane; for single-buffer frames arm and vc are advanced by the offset of data[2] from data[0].
+++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+++    if (gpu_is_buf1(frame))
+++    {
+++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+++        g.arm += frame->data[2] - frame->data[0];
+++        g.vc += frame->data[2] - frame->data[0];
+++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size; NOTE(review): reuses the U plane extent, i.e. assumes V is the same size
+++        return g;
+++    }
+++    else
+++        return *gpu_buf3_gmem(frame, 2);
+++}
+++
+++#endif
+++
+++
+++// QPU specific functions
+++extern void rpi_test_qpu(void);
+++// Selector values passed to qpu_get_fn() to pick a QPU shader entry point; QPU_MC_END is the last enumerator.
+++enum {
+++  QPU_MC_SETUP,
+++  QPU_MC_FILTER,
+++  QPU_MC_EXIT,
+++  QPU_MC_INTERRUPT_EXIT12,
+++  QPU_MC_FILTER_B,
+++  QPU_MC_FILTER_HONLY,
+++  QPU_MC_SETUP_UV,
+++  QPU_MC_FILTER_UV,
+++  QPU_MC_FILTER_UV_B0,
+++  QPU_MC_FILTER_UV_B,
+++  QPU_MC_INTERRUPT_EXIT8,
+++  QPU_MC_END
+++  };
+++extern unsigned int qpu_get_fn(int num); // num is one of the QPU_MC_* values above
+++// QPU counts and mail sizing. NOTE(review): presumably _UV/_Y are the QPUs used for chroma/luma jobs - confirm
+++#define QPU_N_UV   8
+++#define QPU_N_Y    12
+++#define QPU_N_MAX  16
+++
+++#define QPU_MAIL_EL_VALS  2 // 32-bit values per mail element
+++#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t)) // bytes per mail element
+++#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS) // values needed for QPU_N_MAX elements
+++#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t)) // bytes needed for QPU_N_MAX elements
+++
+++// VPU specific functions
+++extern unsigned int vpu_get_fn(void);
+++extern unsigned int vpu_get_constants(void);
+++//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+++extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+++int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+++    int qpu0_n, const uint32_t * qpu0_mail,
+++    int qpu1_n, const uint32_t * qpu1_mail);
+++
+++extern void vpu_wait( int id);
+++
+++// Simple test of shader code
+++extern int rpi_test_shader(void);
+++
+++extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
+++extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+++
+++extern int gpu_get_mailbox(void);
+++
+++#endif
++diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
++new file mode 100644
++index 0000000..06fb166
++--- /dev/null
+++++ b/libavcodec/rpi_shader.c
++@@ -0,0 +1,629 @@
+++#include "rpi_shader.h"
+++
+++#ifdef _MSC_VER
+++   #include <stdint.h>
+++   /* cast through uintptr_t to avoid warnings */
+++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
+++#else
+++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
+++#endif
+++
+++#ifdef __cplusplus
+++extern "C" { /* the types are probably wrong... */
+++#endif
+++#ifdef __cplusplus
+++}
+++#endif
+++
+++#ifdef _MSC_VER
+++__declspec(align(8))
+++#elif defined(__GNUC__)
+++__attribute__((aligned(8)))
+++#endif
+++unsigned int rpi_shader[] = {
+++// ::mc_setup_uv
+++/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+++/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+++/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+++/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+++/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+++/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+++/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+++/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+++/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+++/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+++/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+++/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+++/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+++/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+++/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+++/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+++/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+++/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+++/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+++/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+++/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+++/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+++/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+++/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+++/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+++/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+++/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+++/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+++/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+++/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+++/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+++/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+++/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+++/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+++/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+++/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+++/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+++/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+++/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+++/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+++/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+++/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+++/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+++/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+++/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+++/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+++/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+++// ::mc_filter_uv
+++/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
+++/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+++/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
+++/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+++/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
+++/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
+++/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
+++/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
+++/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
+++/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+++/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+++// :uvloop
+++/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+++/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+++/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+++/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_filter_uv_b0
+++/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
+++/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
+++/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
+++/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
+++/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
+++/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
+++/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+++/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+++/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+++/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+++/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
+++/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
+++// :uvloop_b0
+++/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+++/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+++/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+++/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_filter_uv_b
+++/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+++/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+++/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
+++/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+++/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
+++/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
+++/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
+++/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+++/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+++/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+++/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
+++/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+++/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+++/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+++/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+++/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
+++/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
+++/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
+++/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
+++/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+++/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++// :uvloop_b
+++/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+++/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
+++/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+++/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+++/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+++/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+++/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
+++/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+++/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
+++/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+++/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+++/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_exit
+++/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+++/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_interrupt_exit8
+++/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_setup
+++/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+++/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+++/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+++/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+++/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+++/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+++/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+++/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+++/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+++/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+++/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+++/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+++/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+++/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+++/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+++/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+++/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+++/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+++/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+++/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+++/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+++/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+++/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+++/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+++/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+++/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+++/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+++/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+++/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+++/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+++/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+++/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+++/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+++/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+++/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+++/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+++/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+++/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+++/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+++/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+++/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+++/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+++/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+++/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+++/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+++/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+++/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+++/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+++/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+++/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+++/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+++/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+++/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+++// :per_block_setup
+++/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+++/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
+++/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+++/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+++/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+++/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+++/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
+++/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+++/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+++/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+++/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+++/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+++/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
+++/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+++/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+++/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+++/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+++/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+++/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+++/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+++/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+++/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+++/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+++/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+++/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+++/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+++/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+++/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+++/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+++/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+++/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+++/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+++/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+++/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+++/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+++/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+++/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+++/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+++/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+++/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+++/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+++/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+++/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+++/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+++/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+++/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
+++/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
+++/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
+++/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
+++/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+++/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
+++/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
+++/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+++// ::mc_filter
+++/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+++// :yloop
+++/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+++/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+++/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_filter_b
+++/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+++// :yloopb
+++/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+++/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+++/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+++/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+++/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+++/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+++/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+++/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_interrupt_exit12
+++/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit1
+++/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_end
+++};
+++#ifdef __HIGHC__ /* NOTE(review): presumably the MetaWare High C toolchain - confirm */
+++#pragma Align_to(8, rpi_shader)
+++#endif /* __HIGHC__ */
++diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
++new file mode 100644
++index 0000000..9772796
++--- /dev/null
+++++ b/libavcodec/rpi_shader.h
++@@ -0,0 +1,19 @@
+++#ifndef rpi_shader_H
+++#define rpi_shader_H
+++/* Entry points into the rpi_shader[] instruction table (the table itself is defined in another file). */
+++extern unsigned int rpi_shader[];
+++/* Offsets count unsigned-int words: byte address of the table's "::label" / 4 (assumes 4-byte unsigned int). */
+++#define mc_setup_uv (rpi_shader + 0) /* byte offset 0x0000 */
+++#define mc_filter_uv (rpi_shader + 132) /* byte offset 0x0210 */
+++#define mc_filter_uv_b0 (rpi_shader + 274) /* byte offset 0x0448 */
+++#define mc_filter_uv_b (rpi_shader + 392) /* byte offset 0x0620 */
+++#define mc_exit (rpi_shader + 540) /* byte offset 0x0870 */
+++#define mc_interrupt_exit8 (rpi_shader + 558) /* byte offset 0x08b8 */
+++#define mc_setup (rpi_shader + 588) /* byte offset 0x0930 */
+++#define mc_filter (rpi_shader + 872) /* byte offset 0x0da0 */
+++#define mc_filter_b (rpi_shader + 992) /* byte offset 0x0f80 */
+++#define mc_interrupt_exit12 (rpi_shader + 1114) /* byte offset 0x1168 */
+++#define mc_exit1 (rpi_shader + 1152) /* byte offset 0x1200 */
+++#define mc_end (rpi_shader + 1168) /* byte offset 0x1240: one past the last instruction */
+++
+++#endif /* rpi_shader_H */
++diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
++new file mode 100644
++index 0000000..aa9e1e7
++--- /dev/null
+++++ b/libavcodec/rpi_shader.qasm
++@@ -0,0 +1,1098 @@
+++# register allocation
+++#
+++# ra0...ra7                                     eight horizontal filter coefficients
+++#
+++# rb0 rx_shift2
+++# rb1 rb_y2_next
+++#
+++# rb4...rb7
+++#
+++# rb8..rb11, ra8...ra11                         Y: eight filtered rows of context (ra11 == most recent)
+++#
+++#                                               (ra15 isn't clamped to zero - this happens during the
+++#                                                copy to ra14, and during its use in the vertical filter)
+++#
+++# rb8...rb11                                    eight vertical filter coefficients
+++
+++# ra4                                           y: Filter, UV: 0x10000
+++
+++# rb12                                          offset to add before shift (round + weighting offsets)
+++# rb13                                          shift: denom + 6 + 9
+++# rb14                                          L0 weight (U on left, V on right)
+++# rb15                                          -- free --
+++#
+++# ra16                                          clipped(row start address+elem_num)&~3
+++# ra17                                          per-channel shifts
+++# ra18                                          L1 weight (Y)
+++# ra19                                          next ra17
+++#
+++# rb16                                          pitch
+++# rb17                                          height + 1
+++# rb18                                          height + 3
+++# rb19                                          next ra16
+++#
+++# ra20                                          1
+++# ra21                                          ra_21
+++# ra22 ra_k256                                  256
+++# ra23 ra_y2_next                               ra_y2_next
+++#
+++# rb20                                          0xffffff00
+++# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+++# rb22 rb_k255                                  255
+++# rb23                                          24
+++#
+++# rb24                                          vdw_setup_1(dst_pitch)
+++# rb25                                          frame width-1
+++# rb26                                          height<<23 + width<<16 + vdw_setup_0
+++# rb27                                          vdw_setup_0 (depends on QPU number)
+++# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
+++# rb29                                          vdw_setup_1(dst_pitch-width)
+++# rb30                                          frame height-1
+++# rb31                                          used as temp to count loop iterations
+++#
+++# ra24                                          clipped(row start address+8+elem_num)&~3
+++# ra25                                          per-channel shifts 2
+++# ra26                                          next ra24
+++# ra27                                          next ra25
+++# ra28                                          next y
+++# ra29                                          y for next texture access
+++# ra30                                          64
+++#
+++# ra31                                          next kernel address
+++
+++.set rb_frame_width_minus_1,       rb25
+++.set rb_frame_height_minus_1,      rb30
+++.set rb_pitch,                     rb16
+++.set ra_x,                         ra16
+++.set ra_y2,                        ra21.16a
+++.set ra_y2_next,                   ra21.16b
+++
+++.set rb_x_next,                    rb19
+++.set rx_frame_base2_next,          rb19
+++
+++.set ra_frame_base,                ra24
+++.set ra_frame_base_next,           ra26
+++.set ra_xshift,                    ra17
+++
+++.set ra_u2v_ref_offset,            ra25
+++.set ra_frame_base2,               ra25
+++
+++.set ra_xshift_next,               ra19
+++.set rx_xshift2,                   rb0
+++.set rx_xshift2_next,              rb1
+++
+++.set ra_u2v_dst_offset,            ra27
+++
+++.set ra_y_next,                    ra28
+++.set ra_y,                         ra29
+++
+++.set ra_k1,                        ra20
+++.set rb_k255,                      rb22
+++.set ra_k256,                      ra22
+++
+++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+++.set i_shift16,                    -16
+++.set i_shift21,                    -11
+++
+++################################################################################
+++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+++::mc_setup_uv
+++
+++# Read starting kernel
+++mov ra31, unif
+++
+++# Load first request location
+++add ra_x, unif, elem_num # Store x
+++mov ra_y, unif # Store y
+++mov ra_frame_base, unif # Store frame u base
+++nop
+++sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+++
+++# Read image dimensions
+++sub rb25,unif,1
+++sub rb30,unif,1
+++
+++# get source pitch
+++mov rb16, unif
+++
+++# get destination pitch
+++mov r0, unif
+++mov r1, vdw_setup_1(0)
+++add rb24, r1, r0
+++
+++# load constants
+++
+++mov ra4, 0x10000
+++mov ra_k1, 1
+++mov ra_k256, 256
+++mov ra30, 64
+++
+++mov rb20, 0xffffff00
+++mov rb_k255, 255
+++mov rb23, 24
+++
+++# touch vertical context to keep simulator happy
+++
+++mov ra8, 0
+++mov ra9, 0
+++mov ra10, 0
+++mov ra11, 0
+++mov ra12, 0
+++mov ra13, 0
+++mov ra14, 0
+++mov ra15, 0
+++
+++# Compute base address for first and second access
+++mov r0, ra_x           # Load x
+++max r0, r0, 0; mov r1, ra_y # Load y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+++add ra_y, r1, 1
+++add r0, r0, r3
+++and r0, r0, ~3
+++max r1, r1, 0 ; mov ra_x, r0 # y
+++min r1, r1, rb_frame_height_minus_1
+++# submit texture requests for first line
+++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+++add t0s, r0, r1 ; mov ra_frame_base, r2
+++add t1s, r2, r1
+++
+++mov r2, 9
+++add rb13, r2, unif  # denominator
+++mov -, unif         # Unused
+++
+++# Compute part of VPM to use for DMA output
+++mov r2, unif
+++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+++and r2, r2, 15
+++mov r1, r2
+++asr r1, r1, 2
+++shl r1, r1, 6
+++mov r0, r2
+++and r0, r0, 3
+++add r0, r0, r1
+++
+++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++add rb28, r0, r1  # VPM 8bit storage
+++asr r2, r0, 1     # r0 = bc0000d
+++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+++add rb21, r2, r1  # VPM for 16bit intermediates
+++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++shl r0, r0, 5
+++add rb27, r0, r1  # DMA out
+++
+++# submit texture requests for second line
+++max r1, ra_y, 0
+++min r1, r1, rb_frame_height_minus_1
+++add ra_y, ra_y, 1
+++bra -, ra31
+++nop ; mul24 r1, r1, rb_pitch
+++add t0s, r1, ra_x
+++add t1s, r1, ra_frame_base
+++
+++
+++
+++################################################################################
+++
+++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
+++
+++# At this point we have already issued two pairs of texture requests for the current block
+++# ra_x, ra_x16_base point to the current coordinates for this block
+++::mc_filter_uv
+++mov ra31, unif
+ +
+++# per-channel shifts were calculated on the *previous* invocation
+ +
+-+hevc_uv_deblock_16x16:
+-+  push r6-r15, lr
+-+  mov r14,0
+-+  b hevc_uv_start
+-+hevc_uv_deblock_16x16_with_clear:
+-+  push r6-r15, lr
+-+  mov r14,1
+-+  b hevc_uv_start
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num    # x
+++max r0, r0, 0         ; mov r1, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+++# compute offset from frame base u to frame base v
+++sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+++shl ra_xshift_next, r0, 3
+++add r0, r0, r3        ; mov ra1, unif  # ; width_height
+++and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
+++mov ra_y_next, r1     ; mov vw_setup, rb28
+++add ra_frame_base_next, rb_x_next, r2
+ +
+-+hevc_uv_start:
+-+  mov r9,r4
+-+  mov r4,r3
+-+  mov r13,r2
+-+  mov r2,r0
+-+  mov r10,r0
+-+  subscale4 r0,r1
+-+  mov r8,63
+-+  mov r6,-3
+-+  vmov H(zeros,0),0
+-+# r7 is number of blocks still to load
+-+# r0 is location of current block - 4 * stride
+-+# r1 is stride
+-+# r2 is location of current block
+-+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
+-+# r4 is setup
+-+# r5 is for temporary calculations
+-+# r8 holds 63
+-+# r6 holds -3
+-+# r9 holds the number of 16 high rows to process
+-+# r10 holds the original img base
+-+# r11 returns 0 if no filtering was done on the edge
+-+# r12 saves a copy of this
+-+# r13 is copy of width
+-+# r14 is 1 if we should clear the old contents, or 0 if not
+++# set up VPM write
+++# get width,height of block
+ +
+-+uv_process_row:
+-+  # First iteration does not do horizontal filtering on previous
+-+  mov r7, r13
+-+  mov r3,0
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
+-+  cmp r14,1
+-+  bne uv_skip0
+-+  vstb H(zeros,0),(r4)
+-+uv_skip0:
+-+  bl uv_vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
+-+  bl uv_vert_filter
+-+  sub r3,8
+-+  b uv_start_deblock_loop
+-+uv_deblock_loop:
+-+  # Middle iterations do vertical on current block and horizontal on preceding
+-+  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
+-+  vldb H(16++,16)+r3,(r2 += r1) REP 16
+-+  vldb H(setup_input,0), (r4)
+-+  cmp r14,1
+-+  bne uv_skip1
+-+  vstb H(zeros,0),(r4)
+-+uv_skip1:
+-+  bl uv_vert_filter
+-+  add r3,8
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_vert_filter
+-+  sub r3,8
+-+  vldb H(setup_input,0), -16(r4)
+-+  cmp r14,1
+-+  bne uv_skip3
+-+  vstb H(zeros,0),-16(r4)
+-+uv_skip3:
+-+  bl uv_horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_horz_filter
+-+  sub r3,8*64
+-+  addcmpbeq r12,0,0,uv_skip_save_top
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+uv_skip_save_top:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+-+uv_start_deblock_loop:
+-+  # move onto next 16x16 (could do this with circular buffer support instead)
+-+  add r3,16
+-+  and r3,r8
+-+  add r4,32
+-+  # Perform loop counter operations (may work with an addcmpbgt as well?)
+-+  add r0,16
+-+  add r2,16
+-+  sub r7,1
+-+  cmp r7,0 # Are there still more blocks to load
+-+  bgt uv_deblock_loop
+++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1
+++add rb18, ra1.16a, 3
+++shl r0,   ra1.16a, 7
+++add r0,   r0, ra1.16b    # Combine width and height of destination area
+++shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+++add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++# unpack filter coefficients
+++
+++mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
+++mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
+++nop                   ; mov rb10, ra3.8c
+++mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+++
+++shl r1, ra1.16b, rb13
+++asr rb12, r1, 1
+++shl rb14, ra1.16a, 1  # b14 = weight*2
+++
+++# rb14 - weight L0 * 2
+++# rb13 = weight denom + 6 + 9
+++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++
+++# r2 is elem_num
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++# r3 = 0
+++:uvloop
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++# apply horizontal filter
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop
+++mov ra13, ra14          ; mul24 r1, ra14, rb9
+++mov ra14, ra15
+++mov ra15, r0            ; mul24 r0, ra12, rb8
+++# >>> .anyn uvloop
+++
+++# apply vertical filter and write to VPM
+++
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++add r1, r1, r0          ; mul24 r0, ra15, rb11
+++sub r1, r1, r0          ; mov -, vw_wait
+++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++asr r1, r1, 14
+++nop                     ; mul24 r1, r1, rb14
+++shl r1, r1, 8
+++
+++add r1, r1, rb12
+++brr.anyn -, r:uvloop
+++asr r1, r1, rb13
+++min r1, r1, rb_k255       # Delay 2
+++max vpm, r1, 0         # Delay 3
+++
+++# DMA out for U
+++
+++mov vw_setup, rb26 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+++
+++# DMA out for V
+++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+++# Could potentially push this write into the start of the next pipeline stage.
+++mov r0, 16
+++mov -, vw_wait
+++
+++bra -, ra31
+++add vw_setup, rb26, r0 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+++
+++
+++################################################################################
+++
+++# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, weight_u_l0, weight_v_l0, this_u_dst, this_v_dst) - the two dst uniforms are read and discarded in this pass
+++
+++# At this point we have already issued two pairs of texture requests for the current block
+++# ra_x, ra_x16_base point to the current coordinates for this block
+++::mc_filter_uv_b0
+++mov ra31, unif
+++
+++# per-channel shifts were calculated on the *previous* invocation
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num       # x
+++max r0, r0, 0                ; mov r1, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+++sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+++shl ra_xshift_next, r0, 3
+++add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
+++and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
+++mov ra_y_next, r1            ; mov vw_setup, rb21
+++
+++add ra_frame_base_next, rb_x_next, r2
+++
+++# Need to have unsigned coeffs so we can just unpack in the filter
+++# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+++# filter code. Unpack into b regs for V
+++
+++# set up VPM write, we need to save 16bit precision
+++
+++sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1
+++add rb18, ra1.16a, 3
+++shl r0,   ra1.16a, 7
+++add r0,   r0, ra1.16b           # Combine width and height of destination area
+++shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
+++add rb26, r0, rb27
+++
+++mov rb8, ra3.8a
+++mov rb9, ra3.8b
+++mov rb10, ra3.8c
+++mov rb11, ra3.8d
+++
+++# r2 is elem_num
+++# r3 is loop counter
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++mov      rb14, unif                 # U weight L0
+++mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
+++# rb14 unused in b0 but will hang around till the second pass
+++
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++# r3 = 0
+++:uvloop_b0
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop_b0
+++mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
+++mov ra14, ra15
+++mov ra15, r0            ; mul24 r0, ra12, rb8
+++# >>> .anyn uvloop_b0
+++
+++# apply vertical filter and write to VPM
+++
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++sub.setf -, r3, rb18
+++brr.anyn -, r:uvloop_b0
+++add r1, r1, r0          ; mul24 r0, ra15, rb11
+++sub r1, r1, r0          ; mov -, vw_wait
+++asr vpm, r1, 6
+++# >>> .anyn uvloop_b0
+++
+++# in pass0 we don't really need to save any results, but need to discard the uniforms
+++# DMA out for U
+++
+++bra -, ra31
+++mov -, unif           # Delay 1
+++mov -, unif           # Delay 2
+++nop                   # Delay 3
+++
+++
+++################################################################################
+++
+++::mc_filter_uv_b
+++mov ra31, unif
+++
+++# per-channel shifts were calculated on the *previous* invocation
+++
+++# set up VPM write
+++mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++add r0, unif, elem_num    # x
+++max r0, r0, 0                      ; mov ra_y_next, unif # y
+++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
+++# compute offset from frame base u to frame base v
+++sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
+++add r0, r0, r3                     ; mov ra1, unif       # width_height
+++and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
+++
+++sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+++add rb17, ra1.16a, 1
+++add rb18, ra1.16a, 3
+++shl r0,   ra1.16a, 7
+++
+++add ra_frame_base_next, rb_x_next, r2
+++
+++# r0 is currently height<<7
+++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+++shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+++shr r3, r3, 8
+++add vr_setup, r3, rb21
+++
+++add r0, r0, ra1.16b    # Combine width and height of destination area
+++shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+++add rb26, r0, rb27
+ +
+-+  # Final iteration needs to just do horizontal filtering
+-+  vldb H(setup_input,0), -16(r4)
+-+  cmp r14,1
+-+  bne uv_skip2
+-+  vstb H(zeros,0),-16(r4)
+-+uv_skip2:
+-+  bl uv_horz_filter
+-+  mov r12,r11
+-+  add r3,8*64
+-+  vadd H(setup_input,0),H(setup_input,8),0
+-+  bl uv_horz_filter
+-+  sub r3,64*8
+-+  addcmpbeq r12,0,0,uv_skip_save_top2
+-+  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
+-+uv_skip_save_top2:
+-+  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
+++# get filter coefficients
+ +
+-+# Now look to see if we should do another row
+-+  sub r9,1
+-+  cmp r9,0
+-+  bgt uv_start_again
+-+  pop r6-r15, pc
+-+uv_start_again:
+-+  # Need to sort out r0,r2 to point to next row down
+-+  addscale16 r10,r1
+-+  mov r2,r10
+-+  subscale4 r0,r2,r1
+-+  b uv_process_row
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+++# Get offset & weight stuff
+ +
+-+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
+-+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
+++# The unif read occurs unconditionally, only the write is conditional
+++mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
+++mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
+++nop                 ; mov rb10, ra3.8c
+++mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
+ +
+-+uv_vert_filter:
+-+  push lr
+++shl r1, ra1.16b, rb13
+++asr rb12, r1, 1
+ +
+-+  vmov HX(P1,0), V(16,14)+r3
+-+  vmov HX(P0,0), V(16,15)+r3
+-+  vmov HX(Q0,0), V(16,16)+r3
+-+  vmov HX(Q1,0), V(16,17)+r3
+++# ra1.16a used directly in the loop
+ +
+-+  bl do_chroma_filter
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+  vadds V(16,15)+r3, HX(P0,0), 0
+-+  vadds V(16,16)+r3, HX(Q0,0), 0
+++# r3 = 0
+++:uvloop_b
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+  pop pc
+++sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+ +
+-+# Filter edge at H(16,0)+r3
+-+uv_horz_filter:
+-+  push lr
+++max r2, ra_y, 0  # y
+++min r2, r2, rb_frame_height_minus_1
+++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+++add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+++add t1s, ra_frame_base, r2
+ +
+-+  vmov HX(P1,0), H(14,0)+r3
+-+  vmov HX(P0,0), H(15,0)+r3
+-+  vmov HX(Q0,0), H(16,0)+r3
+-+  vmov HX(Q1,0), H(17,0)+r3
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+ +
+-+  bl do_chroma_filter
+++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+  vadds H(15,0)+r3, HX(P0,0), 0
+-+  # P3 and Q3 never change so don't bother saving back
+-+  vadds H(16,0)+r3, HX(Q0,0), 0
+++nop                  ; mul24      r3, ra0.8a,       r0
+++nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+++nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+++nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+++sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+++nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+++nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++sub r0, r2, r3       ; mov r3, rb31
+++sub.setf -, r3, 4    ; mov ra12, ra13
+++brr.anyn -, r:uvloop_b
+++mov ra13, ra14          ; mul24 r1, ra14, rb9
+++mov ra14, ra15
+++mov ra15, r0            ; mul24 r0, ra12, rb8
+++# >>> .anyn uvloop_b
+ +
+-+  pop pc
+++# apply vertical filter and write to VPM
+ +
+-+# r4 points to array of beta/tc for each 4 length edge
+-+do_chroma_filter:
+-+  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
+-+  valtl HX(tc,0),H(setup,0),H(setup,0)
+++sub r1, r1, r0          ; mul24 r0, ra14, rb10
+++add r1, r1, r0          ; mul24 r0, ra15, rb11
+++# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+++sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
+++sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++asr r1, r1, 14          # shift2=6
+ +
+-+  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
+-+  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
+-+  vsub -,HX(P1,0),HX(Q1,0) SACC
+-+  vmov HX(delta,0),4 SACC
+-+  vasr HX(delta,0),HX(delta,0),3
+-+  vclamps HX(delta,0), HX(delta,0), HX(tc,0)
+-+  vadd HX(P0,0),HX(P0,0),HX(delta,0)
+-+  vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
+-+  b lr
+++asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+++nop                     ; mul24 r0, r0, rb14
+ +
+-+# r0 = list
+-+# r1 = number
+-+hevc_run_command_list:
+-+  push r6-r7, lr
+-+  mov r6, r0
+-+  mov r7, r1
+-+loop_cmds:
+-+  ld r0,(r6) # How to encode r6++?
+-+  add r6,4
+-+  ld r1,(r6)
+-+  add r6,4
+-+  ld r2,(r6)
+-+  add r6,4
+-+  ld r3,(r6)
+-+  add r6,4
+-+  ld r4,(r6)
+-+  add r6,4
+-+  ld r5,(r6)
+-+  add r6,4
+-+  bl hevc_trans_16x16
+-+  sub r7,1
+-+  cmp r7,0
+-+  bgt loop_cmds
+++add r1, r1, r0          ; mov -, vw_wait
+++shl r1, r1, 8           # Lose bad top 8 bits & sign extend
+ +
+-+  pop r6-r7, pc
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 91777be..5aa0432 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -397,6 +397,8 @@ static void *vpu_start(void *arg) {
+-   int start_time;
+-   int end_time;
+-   int count=0;
+-+  int count_deblock=0;
+-+  int count_qpu=0;
+- #endif
+-   while(1) {
+-     int i;
+-@@ -442,7 +444,7 @@ static void *vpu_start(void *arg) {
+-         break;
+-       }
+-     }
+--    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+-+    //printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
+- #endif
+-     qpu_code = p[7];
+-     qpu_codeb = p[16];
+-@@ -460,6 +462,12 @@ static void *vpu_start(void *arg) {
+-     off_time += start_time-last_time;
+- #endif
+- 
+-+#define NO_FLUSH 1
+-+#define CLEAR_PROFILE 2
+-+#define OUTPUT_COUNTS 4
+++add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+ +
+-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++brr.anyn -, r:uvloop_b
+++asr r1, r1, rb13         # Delay 1
+++min r1, r1, rb_k255       # Delay 2
+++max vpm, r1, 0         # Delay 3
+ +
+- #ifdef RPI_COMBINE_JOBS
+-     if (have_qpu) {
+-       for(i=0;i<8;i++) {
+-@@ -472,14 +480,14 @@ static void *vpu_start(void *arg) {
+-       }
+-       if (have_vpu) {
+-         execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
+-         q[0] = 0;
+-       } else {
+-         execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-@@ -510,7 +518,7 @@ static void *vpu_start(void *arg) {
+-       execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
+- #else
+-       execute_multi(gpu->mb,
+--                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
+-+                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING , 5000,
+-                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
+-                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
+-                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
+-@@ -525,17 +533,20 @@ static void *vpu_start(void *arg) {
+-     // There are three cases we may wish to distinguish of VPU/QPU activity
+-     on_time += end_time - start_time;
+- #else
+--    if (p[6]==2)
+-+    if (p[6]>1) {
+-+      count_deblock++;
+-       on_time_deblock += end_time - start_time;
+--    else
+-+    } else {
+-       on_time += end_time - start_time;
+-+      count_qpu++;
+-+    }
+- #endif
+-     count++;
+-     if ((count&0x7f)==0)
+- #ifdef RPI_COMBINE_JOBS
+--      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
+--#else
+-       printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
+-+#else
+-+      printf("Posted %d On=%dms (%d calls), On_deblock=%dms (%d calls), Off=%dms\n",count,(int)(on_time/1000),count_qpu,(int)(on_time_deblock/1000),count_deblock,(int)(off_time/1000));
+- #endif
+- #endif
+- job_done_early:
+-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+-index 0686249..64bf5b0 100644
+---- a/libavcodec/rpi_shader.qasm
+-+++ b/libavcodec/rpi_shader.qasm
+-@@ -1077,6 +1077,17 @@ nop        ; nop # delay slot 2
+- ::mc_interrupt_exit12
+-   mov  -, vw_wait # wait on the VDW
+- 
+-+  # Dummy wait to test instructions
+-+#  mov r3,1000000
+-+#:dummy_loop
+-+#  sub.setf r3, r3, 1
+-+#  nop
+-+#  nop
+-+#  brr.anynn -, r:dummy_loop
+-+#  nop
+-+#  nop
+-+#  nop
+ +
+-   ldtmu0
+-   ldtmu0
+-   ldtmu1
+--- 
+-2.7.4
+-
+-
+-From 12a194bddd049ab97154e9fbdd46b63b558a3bee Mon Sep 17 00:00:00 2001
+-From: Ben Avison <bavison@riscosopen.org>
+-Date: Tue, 23 Jun 2015 23:42:03 +0100
+-Subject: [PATCH 67/68] armv7/hevc: Optimise deblocking boundary strength
+- calculation
+-
+----
+- libavcodec/arm/hevcdsp_deblock_neon.S | 115 +++++++++++++++++
+- libavcodec/arm/hevcdsp_init_neon.c    |   9 ++
+- libavcodec/hevc.h                     |  11 --
+- libavcodec/hevc_filter.c              | 224 ++++++++++++++--------------------
+- libavcodec/hevcdsp.c                  | 116 ++++++++++++++++++
+- libavcodec/hevcdsp.h                  |  14 +++
+- 6 files changed, 344 insertions(+), 145 deletions(-)
+-
+-diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+-index 166bddb..bad4589 100644
+---- a/libavcodec/arm/hevcdsp_deblock_neon.S
+-+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+-@@ -383,3 +383,118 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+-         vst1.8   {d4}, [r0]
+-         bx       lr
+- endfunc
+++# DMA out for U
+ +
+-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+-+ */
+-+function ff_hevc_deblocking_boundary_strengths_neon, export=1
+-+        add         ip, sp, #4*4
+-+        push        {a2-a4,v1-v8,lr}
+-+        ldmia       ip, {v5-v7}
+-+1:      ldmdb       ip, {v1-v4}
+-+        ldrsb       a3, [v5, #8]    @ curr->ref_idx
+-+        ldrsb       v8, [v5, #9]
+-+        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
+-+        ldrsb       lr, [v6, #9]
+-+        ldr         v1, [v1, a3, lsl #2]
+-+        ldrb        a3, [v5, #10]   @ curr->pred_flag
+-+        ldr         v2, [v2, v8, lsl #2]
+-+        ldrb        v8, [v6, #10]   @ neigh->pred_flag
+-+        ldr         v3, [v3, ip, lsl #2]
+-+        ldr         v4, [v4, lr, lsl #2]
+-+        teq         a3, #3
+-+        beq         20f
+-+        teq         v8, #3
+-+        beq         90f
+++mov vw_setup, rb26 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+ +
+-+        tst         a3, #1
+-+        ldrne       a3, [v5, #0]    @ curr->mv[0]
+-+        ldreq       a3, [v5, #4]    @ curr->mv[1]
+-+        moveq       v1, v2
+-+        tst         v8, #1
+-+        ldrne       v8, [v6, #0]    @ neigh->mv[0]
+-+        ldreq       v8, [v6, #4]    @ neigh->mv[1]
+-+        moveq       v3, v4
+-+        teq         v1, v3
+-+        bne         10f
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v8, a3
+-+        ssub16      a3, a3, v8
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        @ drop through
+-+10:     movne       a3, #1
+-+11:     subs        a2, a2, #1
+-+12:     strbhs      a3, [v7], a4
+-+        subs        a2, a2, #1
+-+        bhs         12b
+++# DMA out for V
+++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+++# Could potentially push this write into the start of the next pipeline stage.
+++mov r0, 16
+++mov -, vw_wait
+++
+++bra -, ra31
+++add vw_setup, rb26, r0 # VDW setup 0
+++mov vw_setup, rb29 # Stride
+++mov vw_addr, unif # start the VDW
+++
+++################################################################################
+++
+++# mc_exit()
+++
+++::mc_exit
+++mov  -, vw_wait # wait on the VDW
+++
+++mov -,srel(0)
+++
+++ldtmu0
+++ldtmu1
+++ldtmu0
+++ldtmu1
+++
+++nop        ; nop ; thrend
+++nop        ; nop # delay slot 1
+++nop        ; nop # delay slot 2
+++
+++# mc_interrupt_exit8()
+++::mc_interrupt_exit8
+++mov  -, vw_wait # wait on the VDW
+++
+++ldtmu0
+++ldtmu1
+++ldtmu0
+++ldtmu1
+++
+++mov -,sacq(0) # 1
+++mov -,sacq(0) # 2
+++mov -,sacq(0) # 3
+++mov -,sacq(0) # 4
+++mov -,sacq(0) # 5
+++mov -,sacq(0) # 6
+++mov -,sacq(0) # 7
+++
+++nop        ; nop ; thrend
+++mov interrupt, 1; nop # delay slot 1
+++nop        ; nop # delay slot 2
+++
+++
+++
+++
+++
+++# LUMA CODE
+++
+++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
+++# For P frames we make the second x,y coordinates offset by +8
+ +
+-+        ldm         sp, {a2, a3}
+-+        add         ip, sp, #16*4
+-+        subs        a1, a1, #1
+-+        add         v5, v5, a3
+-+        add         v6, v6, a3
+-+        bhi         1b
+-+        pop         {a2-a4,v1-v8,pc}
+++################################################################################
+++# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+++::mc_setup
+++  mov r3, 16
+ +
+-+20:     teq         v8, #3
+-+        bne         10b
+++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+++  mov ra8, unif  # y_x
+++  mov ra9, unif  # ref_y_base
+++  mov ra10, unif # y2_x2
+++  mov ra11, unif # ref_y2_base
+ +
+-+        teq         v1, v3
+-+        teqeq       v2, v4
+-+        bne         40f
+-+        teq         v1, v2
+-+        bne         30f
+++# Read image dimensions
+++  mov r1, unif # width_height
+++  shl r0,r1,r3
+++  asr r1,r1,r3 # width
+++  asr r0,r0,r3 # height
+++  sub rb_frame_width_minus_1,r1,1
+++  sub rb_frame_height_minus_1,r0,1
+ +
+-+        ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v3, v1
+-+        ssub16      a3, v1, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         25f
+-+        ssub16      ip, v4, v2
+-+        ssub16      a3, v2, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        beq         11b
+-+        @ drop through
+-+25:     ssub16      ip, v4, v1
+-+        ssub16      a3, v1, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         10b
+-+        ssub16      ip, v3, v2
+-+        ssub16      a3, v2, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        b           10b
+++# get source pitch
+++  mov rb_pitch, unif # src_pitch
+ +
+-+30:     ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        ssub16      ip, v3, v1
+-+        ssub16      a3, v1, v3
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        bne         10b
+-+        ssub16      ip, v4, v2
+-+        ssub16      a3, v2, v4
+-+        sel         a3, a3, ip
+-+        ands        a3, a3, lr
+-+        b           10b
+++# get destination pitch
+++  mov r0, unif       # dst_pitch
+++  mov r1, vdw_setup_1(0)
+++  add rb24, r1, r0
+ +
+-+40:     teq         v1, v4
+-+        teqeq       v2, v3
+-+        bne         10b
+++# Compute base address for first and second access
+++  mov r1, ra8 # y_x
+++  shl r0,r1,r3 # r0 is x<<16
+++  asr r1,r1,r3 # r1 is y
+++  asr r0,r0,r3 # r0 is x
+++  add r0, r0, elem_num # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+++  shl ra_xshift_next, r0, 3 # Compute shifts
+++  add ra_y, r1, 1
+++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+++  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+++  max r1, r1, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+++  add t0s, r2, r1 ; mov ra_frame_base, r2
+ +
+-+        ldrd        v1, v2, [v5]    @ curr->mv
+-+        ldrd        v3, v4, [v6]    @ neigh->mv
+-+        ldr         lr, =0xFFFCFFFC
+-+        b           25b
++  mov r1, ra10 # y2_x2
+++  shl r0,r1,r3 # r0 is x<<16
+++  asr r1,r1,r3 # r1 is y
+++  asr r0,r0,r3 # r0 is x
+++  add r0, r0, elem_num # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+++  shl rx_xshift2_next, r0, 3 # Compute shifts
+++  add ra_y2, r1, 1
+++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+++  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+++  max r1, r1, 0
+++  min r1, r1, rb_frame_height_minus_1
++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame1
+++  add t1s, r2, r1 ; mov ra_frame_base2, r2
+ +
+-+90:     mov         a3, #1
+-+        b           11b
+-+endfunc
+-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
+-index e5da7e9..49c70dd 100644
+---- a/libavcodec/arm/hevcdsp_init_neon.c
+-+++ b/libavcodec/arm/hevcdsp_init_neon.c
+-@@ -290,6 +290,10 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
+- }
+- #undef CMP
+- 
+-+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                                MvField *curr, MvField *neigh, uint8_t *bs);
+ +
+- av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+- {
+-     if (bit_depth == 8) {
+-@@ -387,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+-         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+-         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+-     }
+++# load constants
+ +
+-+    assert(offsetof(MvField, mv) == 0);
+-+    assert(offsetof(MvField, ref_idx) == 8);
+-+    assert(offsetof(MvField, pred_flag) == 10);
+-+    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
+- }
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 7eb37e6..496c0e1 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -684,17 +684,6 @@ typedef struct CodingUnit {
+-     uint8_t cu_transquant_bypass_flag;
+- } CodingUnit;
+- 
+--typedef struct Mv {
+--    int16_t x;  ///< horizontal component of motion vector
+--    int16_t y;  ///< vertical component of motion vector
+--} Mv;
+--
+--typedef struct MvField {
+--    DECLARE_ALIGNED(4, Mv, mv)[2];
+--    int8_t ref_idx[2];
+--    int8_t pred_flag;
+--} MvField;
+--
+- typedef struct NeighbourAvailable {
+-     int cand_bottom_left;
+-     int cand_left;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 6367068..826a82f 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -726,69 +726,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-     }
+- }
+- 
+--static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
+--                             RefPicList *neigh_refPicList)
+--{
+--    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+--        // same L0 and L1
+--        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
+--            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
+--            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
+--            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+--                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+--                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+--                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+--                return 1;
+--            else
+--                return 0;
+--        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+--                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+--            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+--                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
+--                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
+--            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+--                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else {
+--            return 1;
+--        }
+--    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+--        Mv A, B;
+--        int ref_A, ref_B;
+--
+--        if (curr->pred_flag & 1) {
+--            A     = curr->mv[0];
+--            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
+--        } else {
+--            A     = curr->mv[1];
+--            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
+--        }
+--
+--        if (neigh->pred_flag & 1) {
+--            B     = neigh->mv[0];
+--            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
+--        } else {
+--            B     = neigh->mv[1];
+--            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
+--        }
+--
+--        if (ref_A == ref_B) {
+--            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
+--                return 1;
+--            else
+--                return 0;
+--        } else
+--            return 1;
+--    }
+--
+--    return 1;
+--}
+- 
+- void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-                                            int log2_trafo_size)
+-@@ -799,10 +736,17 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
+-     int min_pu_width     = s->ps.sps->min_pu_width;
+-     int min_tu_width     = s->ps.sps->min_tb_width;
+--    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
+--                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
+-     int boundary_upper, boundary_left;
+--    int i, j, bs;
+-+    int i, j;
+-+    RefPicList *rpl      = s->ref->refPicList;
+-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+-+    int y_pu             = y0 >> log2_min_pu_size;
+-+    int x_pu             = x0 >> log2_min_pu_size;
+-+    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+-+    int is_intra         = curr->pred_flag == PF_INTRA;
+-+    int inc              = log2_min_pu_size == 2 ? 2 : 1;
+-+    uint8_t *bs;
+- 
+- #ifdef DISABLE_STRENGTHS
+-     return;
+-@@ -818,34 +762,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+-         boundary_upper = 0;
+- 
+-+    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+++  mov ra_k1, 1
+++  mov ra_k256, 256
+++  mov ra30, 64
+++
+++  mov rb20, 0xffffff00
+++  mov rb_k255, 255
+++  mov rb23, 24
+++
+++# touch vertical context to keep simulator happy
+++
+++  mov ra8, 0
+++  mov ra9, 0
+++  mov ra10, 0
+++  mov ra11, 0
+++  mov ra12, 0
+++  mov ra13, 0
+++  mov ra14, 0
+++  mov ra15, 0
+ +
+-     if (boundary_upper) {
+-         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
+-                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
+--                              s->ref->refPicList;
+--        int yp_pu = (y0 - 1) >> log2_min_pu_size;
+--        int yq_pu =  y0      >> log2_min_pu_size;
+--        int yp_tu = (y0 - 1) >> log2_min_tu_size;
+--        int yq_tu =  y0      >> log2_min_tu_size;
+-+                              rpl;
+-+        MvField *top = curr - min_pu_width;
+++# Compute part of VPM to use
+++  mov r2, qpu_num
+++  mov r1, r2
+++  asr r1, r1, 2
+++  shl r1, r1, 6
+++  mov r0, r2
+++  and r0, r0, 3
+++  add r0, r0, r1
+++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++  add rb28, r0, r1  # VPM for saving data
+++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++  shl r0, r0, 5
+++  add rb27, r0, r1  # Command for dma output
+ +
+-+        if (is_intra) {
+-+            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+-+                bs[i >> 2] = 2;
+++# Weighted prediction denom
+++  add rb13, unif, 9  # unif = weight denom + 6
+ +
+-+        } else {
+-+            int y_tu = y0 >> log2_min_tu_size;
+-+            int x_tu = x0 >> log2_min_tu_size;
+-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+-+            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+++  mov -, unif # Unused
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+-+                    curr, top, bs);
+- 
+-             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int x_pu = (x0 + i) >> log2_min_pu_size;
+--                int x_tu = (x0 + i) >> log2_min_tu_size;
+--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+--                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
+--                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
+--
+--                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
+--                    bs = 2;
+--                else if (curr_cbf_luma || top_cbf_luma)
+--                    bs = 1;
+--                else
+--                    bs = boundary_strength(s, curr, top, rpl_top);
+--                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+-+                int i_pu = i >> log2_min_pu_size;
+-+                int i_tu = i >> log2_min_tu_size;
+++# submit texture requests for second line
+++  max r1, ra_y, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1
+++  nop ; mul24 r1, r1, rb_pitch
+++  add t0s, r1, ra_frame_base
+ +
+-+                if (top[i_pu].pred_flag == PF_INTRA)
+-+                    bs[i >> 2] = 2;
+-+                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+-+                    bs[i >> 2] = 1;
+-             }
+-+        }
+-+    }
+++  max r1, ra_y2, 0
+++  min r1, r1, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1
+++  nop ; mul24 r1, r1, rb_pitch
+++  add t1s, r1, ra_frame_base2
+ +
+-+    if (!is_intra) {
+-+        for (j = inc; j < trafo_in_min_pus; j += inc) {
+-+            MvField *top;
++# FALL THROUGH TO PER-BLOCK SETUP
+ +
+-+            curr += min_pu_width * inc;
+-+            top = curr - min_pu_width;
+-+            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+++# Start of per-block setup code
+++# P and B blocks share the same setup code to save on Icache space
+++:per_block_setup
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  mov ra31, unif
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+-+                    curr, top, bs);
+-+        }
+-     }
+- 
+--    // bs for vertical TU boundaries
+-     boundary_left = x0 > 0 && !(x0 & 7);
+-     if (boundary_left &&
+-         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
+-@@ -856,64 +822,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+-           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+-         boundary_left = 0;
+- 
+-+    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+-+    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+++  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
+ +
+-     if (boundary_left) {
+-         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
+-                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
+--                               s->ref->refPicList;
+--        int xp_pu = (x0 - 1) >> log2_min_pu_size;
+--        int xq_pu =  x0      >> log2_min_pu_size;
+--        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+--        int xq_tu =  x0      >> log2_min_tu_size;
+--
+--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int y_pu      = (y0 + i) >> log2_min_pu_size;
+--                int y_tu      = (y0 + i) >> log2_min_tu_size;
+--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+--                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
+--                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
+--
+--                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
+--                    bs = 2;
+--                else if (curr_cbf_luma || left_cbf_luma)
+--                    bs = 1;
+--                else
+--                    bs = boundary_strength(s, curr, left, rpl_left);
+--                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+--            }
+--    }
+-+                               rpl;
+-+        MvField *left = curr - 1;
+- 
+--    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
+--        RefPicList *rpl = s->ref->refPicList;
+-+        if (is_intra) {
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+-+                bs[j * s->bs_width >> 2] = 2;
+- 
+--        // bs for TU internal horizontal PU boundaries
+--        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+--            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+--            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+--
+--            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+--                int x_pu = (x0 + i) >> log2_min_pu_size;
+--                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+--                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
+--
+--                bs = boundary_strength(s, curr, top, rpl);
+--                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+-+        } else {
+-+            int y_tu = y0 >> log2_min_tu_size;
+-+            int x_tu = x0 >> log2_min_tu_size;
+-+            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+-+            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+++# per-channel shifts were calculated on the *previous* invocation
+++  mov ra_xshift, ra_xshift_next
+++  mov rx_xshift2, rx_xshift2_next
+++
+++# get base addresses and per-channel shifts for *next* invocation
+++
+++  add r0, ra1.16a, r1 # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+++  shl ra_xshift_next, r0, 3 # Compute shifts
+++  mov r3, 8                          ; mov ra_y_next, ra1.16b
+++  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
+++  add ra_frame_base_next, r2, r0
+++
+++  add r0, ra1.16a, r1 # Load x
+++  max r0, r0, 0
+++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+++  shl rx_xshift2_next, r0, 3         # Compute shifts
+++  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
+++  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
+++  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
+++
+++# set up VPM write
+++  mov vw_setup, rb28
+++
+++# get width,height of block (unif load above)
+++  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+++  add rb17, ra1.16a, 5
+++  add rb18, ra1.16a, 7
+++  shl r0,   ra1.16a, 7
+++  add r0,   r0, ra1.16b # Combine width and height of destination area
+++  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
+++
+++# get filter coefficients and discard unused B frame values
+++  shl.ifz r0, r0, i_shift16      # Pick half to use
+++  shl ra8, r0, 3
+++
+++# Pack the 1st 4 filter coefs for H & V tightly
+++
+++  mov r1,0x00010100  # -ve
+++  ror ra2.8a, r1, ra8.8d
+++  ror ra0.8a, r1, ra8.8c
+++
+++  mov r1,0x01040400
+++  ror ra2.8b, r1, ra8.8d
+++  ror ra0.8b, r1, ra8.8c
+++
+++  mov r1,0x050b0a00  # -ve
+++  ror ra2.8c, r1, ra8.8d
+++  ror ra0.8c, r1, ra8.8c
+++
+++  mov r1,0x11283a40
+++  ror ra2.8d, r1, ra8.8d
+++  ror ra0.8d, r1, ra8.8c
+++
+++# In the 2nd vertical half we use b registers due to
++# using a-side fifo regs. The easiest way to achieve this is to pack it
+++# and then unpack!
+++
+++  mov r1,0x3a281100
+++  ror ra3.8a, r1, ra8.8d
+++  ror ra1.8a, r1, ra8.8c
+++
+++  mov r1,0x0a0b0500  # -ve
+++  ror ra3.8b, r1, ra8.8d
+++  ror ra1.8b, r1, ra8.8c
+++
+++  mov r1,0x04040100
+++  ror ra3.8c, r1, ra8.8d
+++  ror ra1.8c, r1, ra8.8c
+++
+++# Extract weighted prediction information in parallel
+++
+++  mov r1,0x01010000  # -ve
++  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 (hi16)/weight L0 (lo16)
+++  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
+++
+++# r3 = 16 from (long way) above
++  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offset = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+++  asr ra18, r0, r3          ; mov rb5, ra3.8b
+++  bra -, ra31
+++  shl r0, r0, r3            ; mov rb6, ra3.8c
+++  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
+++  asr rb12, r1, 9
+++
+++# >>> branch ra31
+++#
+++# r3 = 0
+++# ra18 = weight L1
+++# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
+++# rb13 = weight denom + 6 + 9
+++# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++
+++
+++################################################################################
+++# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# In a P block, y2_x2 should be y_x+8
+++# At this point we have already issued two pairs of texture requests for the current block
+++
+++::mc_filter
+++# r0 = weight << 16; We want weight * 2 in rb14
+++  asr rb14, r0, 15
+++
+++# r3 = 0
+++
+++:yloop
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+++
+++# If we knew there was no clipping then this code would get simpler.
+++# Perhaps we could add on the pitch and clip using larger values?
+++
+++# N.B. Whilst y == y2 as far as this loop is concerned we will start
+++# the grab for the next block before we finish with this block and that
+++# might be B where y != y2 so we must do full processing on both y and y2
+++
+++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++
+++  max r2, ra_y, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++
+++  max r2, ra_y2, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+++
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++
+++# apply horizontal filter
+++  nop                  ; mul24      r3, ra0.8a,      r0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  sub r0, r2, r3       ; mov r3, rb31
+++
+++  sub.setf -, r3, 8       ; mov r1,   ra8
+++  mov ra8,  ra9           ; mov rb8,  rb9
+++  brr.anyn -, r:yloop
+++  mov ra9,  ra10          ; mov rb9,  rb10
+++  mov ra10, ra11          ; mov rb10, rb11
+++  mov ra11, r0            ; mov rb11, r1
+++  # >>> .anyn yloop
+++
+++  # apply vertical filter and write to VPM
+++
+++  nop                     ; mul24 r0, rb8,  ra2.8a
+++  nop                     ; mul24 r1, rb9,  ra2.8b
+++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++  add r1, r1, r0          ; mul24 r0, ra11, rb7
+++  sub r1, r1, r0          ; mov -, vw_wait
+++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
+++#  +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
+++# The top 8 bits have rubbish in them as mul24 is unsigned
+++# The low 6 bits need discard before weighting
+++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256  # x256 - sign extend & discard rubbish
+++  asr r1, r1, 14
+++  nop                     ; mul24 r1, r1, rb14
+++  add r1, r1, rb12
+++
+++  shl r1, r1, 8
+++  brr.anyn -, r:yloop
+++  asr r1, r1, rb13
+++# We have a saturating pack unit - I can't help feeling it should be useful here
+++  min r1, r1, rb_k255       # Delay 2  rb_k255 = 255
+++  max vpm, r1, 0         # Delay 3
+++# >>> branch.anyn yloop
+++
+++# DMA out
+++
+++  brr -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, unif # start the VDW   Delay 3
+++
+++
+++
+++################################################################################
+++
+++# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# In a P block, only the first half of coefficients contain used information.
+++# At this point we have already issued two pairs of texture requests for the current block
+++# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+++# Can fill in the coefficients so only
+++# Can also assume default weighted prediction for B frames.
+++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
+++# Or possibly by taking advantage of symmetry?
+++# From 19->7 32bits per command.
+ +
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+-+                    curr, left, bs);
+++::mc_filter_b
+++  # r0 = weightL0 << 16, we want it in rb14
+++  asr rb14, r0, i_shift16
+ +
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+-+                int j_pu = j >> log2_min_pu_size;
+-+                int j_tu = j >> log2_min_tu_size;
+++:yloopb
+++# retrieve texture results and pick out bytes
+++# then submit two more texture requests
+ +
+-+                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+-+                    bs[j * s->bs_width >> 2] = 2;
+-+                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+-+                    bs[j * s->bs_width >> 2] = 1;
+-             }
+-         }
+-+    }
+- 
+--        // bs for TU internal vertical PU boundaries
+--        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+--            int y_pu = (y0 + j) >> log2_min_pu_size;
+-+    if (!is_intra) {
+-+        for (i = inc; i < trafo_in_min_pus; i += inc) {
+-+            MvField *left;
+- 
+--            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
+--                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
+--                int xq_pu = (x0 + i)     >> log2_min_pu_size;
+--                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
+--                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+-+            curr += inc;
+-+            left = curr - 1;
+-+            bs += inc << log2_min_pu_size >> 2;
+- 
+--                bs = boundary_strength(s, curr, left, rpl);
+--                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+--            }
+-+            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+-+                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+-+                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+-+                    curr, left, bs);
+-         }
+-     }
+- }
+-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
+-index 9d773d9..a6534a9 100644
+---- a/libavcodec/hevcdsp.c
+-+++ b/libavcodec/hevcdsp.c
+-@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+- #include "hevcdsp_template.c"
+- #undef BIT_DEPTH
+- 
+-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                               MvField *curr, MvField *neigh, uint8_t *bs)
+-+{
+-+    for (; pus > 0; pus--) {
+-+        int strength, out;
+-+        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+-+        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+-+        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+-+        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+++# If we knew there was no clipping then this code would get simpler.
+++# Perhaps we could add on the pitch and clip using larger values?
+ +
+-+#if 1 // This more directly matches the original implementation
+-+        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
+-+            // same L0 and L1
+-+            if (curr_refL0 == neigh_refL0 &&
+-+                curr_refL0 == curr_refL1 &&
+-+                neigh_refL0 == neigh_refL1) {
+-+                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-+                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+-+                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-+                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else if (neigh_refL0 == curr_refL0 &&
+-+                       neigh_refL1 == curr_refL1) {
+-+                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+-+                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else if (neigh_refL1 == curr_refL0 &&
+-+                       neigh_refL0 == curr_refL1) {
+-+                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+-+                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else {
+-+                strength = 1;
+-+            }
+-+        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+-+            Mv curr_mv0, neigh_mv0;
+++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+-+            if (curr->pred_flag & 1) {
+-+                curr_mv0   = curr->mv[0];
+-+            } else {
+-+                curr_mv0   = curr->mv[1];
+-+                curr_refL0 = curr_refL1;
+-+            }
+++  max r2, ra_y, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+ +
+-+            if (neigh->pred_flag & 1) {
+-+                neigh_mv0   = neigh->mv[0];
+-+            } else {
+-+                neigh_mv0   = neigh->mv[1];
+-+                neigh_refL0 = neigh_refL1;
+-+            }
+++  max r2, ra_y2, 0  # y
+++  min r2, r2, rb_frame_height_minus_1
+++  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+ +
+-+            if (curr_refL0 == neigh_refL0) {
+-+                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+-+                    strength = 1;
+-+                else
+-+                    strength = 0;
+-+            } else
+-+                strength = 1;
+-+        } else
+-+            strength = 1;
+-+#else // This has exactly the same effect, but is more suitable for vectorisation
+-+        Mv curr_mv[2];
+-+        Mv neigh_mv[2];
+-+        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+-+        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+++# generate seven shifted versions
+++# interleave with scroll of vertical context
+ +
+-+        if (!(curr->pred_flag & 2)) {
+-+            curr_mv[1] = curr_mv[0];
+-+            curr_refL1 = curr_refL0;
+-+        }
+-+        if (!(neigh->pred_flag & 2)) {
+-+            neigh_mv[1] = neigh_mv[0];
+-+            neigh_refL1 = neigh_refL0;
+-+        }
+-+        if (!(curr->pred_flag & 1)) {
+-+            curr_mv[0] = curr_mv[1];
+-+            curr_refL0 = curr_refL1;
+-+        }
+-+        if (!(neigh->pred_flag & 1)) {
+-+            neigh_mv[0] = neigh_mv[1];
+-+            neigh_refL0 = neigh_refL1;
+-+        }
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+        strength = 1;
+++# apply horizontal filter
+++  nop                  ; mul24      r3, ra0.8a,      r0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  sub r0, r2, r3       ; mov r3, rb31
+++
+++  sub.setf -, r3, 8       ; mov r1,   ra8
+++  mov ra8,  ra9           ; mov rb8,  rb9
+++  brr.anyn -, r:yloopb
+++  mov ra9,  ra10          ; mov rb9,  rb10
+++  mov ra10, ra11          ; mov rb10, rb11
+++  mov ra11, r0            ; mov rb11, r1
+++  # >>> .anyn yloopb
+ +
+-+        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+-+                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+-+                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+++  # apply vertical filter and write to VPM
+ +
+-+        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+-+                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+-+                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+++  nop                     ; mul24 r0, rb8,  ra2.8a
+++  nop                     ; mul24 r1, rb9,  ra2.8b
+++  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++  sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++  add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++  add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++  sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++  add r1, r1, r0          ; mul24 r0, ra11, rb7
+++  sub r1, r1, r0          ; mov r2, rb12
+++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
+++# Top 8 bits are bad - low 6 bits should be discarded
+++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+ +
+-+        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+-+#endif
+++  asr r1, r1, 14
+++  nop                     ; mul24 r0, r1, rb14
+++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+ +
+-+        curr += in_inc / sizeof (MvField);
+-+        neigh += in_inc / sizeof (MvField);
+++  add r1, r1, r0          ; mov -, vw_wait
+++  shl r1, r1, 8
+ +
+-+        for (out = dup; out > 0; out--)
+-+        {
+-+            *bs = strength;
+-+            bs += out_inc;
+-+        }
+-+    }
+-+}
+++  brr.anyn -, r:yloopb
+++  asr r1, r1, rb13         # Delay 1
+++  min r1, r1, rb_k255       # Delay 2
+++  max vpm, r1, 0         # Delay 3
+ +
+- void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+- {
+- #undef FUNC
+-@@ -257,6 +371,8 @@ int i = 0;
+-         break;
+-     }
+- 
+-+    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+++# DMA out
+++  brr -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, unif # start the VDW   Delay 3
+++
+++################################################################################
+++
+++# mc_interrupt_exit12()
+++::mc_interrupt_exit12
+++  mov  -, vw_wait # wait on the VDW
+++
+++  # Dummy wait to test instructions
+++#  mov r3,1000000
+++#:dummy_loop
+++#  sub.setf r3, r3, 1
+++#  nop
+++#  nop
+++#  brr.anynn -, r:dummy_loop
+++#  nop
+++#  nop
+++#  nop
+++
+++  ldtmu0
+++  ldtmu0
+++  ldtmu1
+++  ldtmu1
+++
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++  mov -,sacq(0) # 8
+++  mov -,sacq(0) # 9
+++  mov -,sacq(0) # 10
+++  mov -,sacq(0) # 11
+++
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+++
+++
+++::mc_exit1
+++  mov  -, vw_wait # wait on the VDW
+++
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  ldtmu1
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+++
+++
+++::mc_end
+++# Do not add code here because mc_end must appear after all other code.
++diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
++new file mode 100644
++index 0000000..db41a4d
++--- /dev/null
+++++ b/libavcodec/rpi_user_vcsm.h
++@@ -0,0 +1,459 @@
+++/*****************************************************************************
+++* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+++*
+++* This program is the proprietary software of Broadcom Corporation and/or
+++* its licensors, and may only be used, duplicated, modified or distributed
+++* pursuant to the terms and conditions of a separate, written license
+++* agreement executed between you and Broadcom (an "Authorized License").
+++* Except as set forth in an Authorized License, Broadcom grants no license
+++* (express or implied), right to use, or waiver of any kind with respect to
+++* the Software, and Broadcom expressly reserves all rights in and to the
+++* Software and all intellectual property rights therein.  IF YOU HAVE NO
+++* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+++* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+++* THE SOFTWARE.
+++*
+++* Except as expressly set forth in the Authorized License,
+++* 1. This program, including its structure, sequence and organization,
+++*    constitutes the valuable trade secrets of Broadcom, and you shall use
+++*    all reasonable efforts to protect the confidentiality thereof, and to
+++*    use this information only in connection with your use of Broadcom
+++*    integrated circuit products.
+++* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+++*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+++*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+++*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+++*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+++*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+++*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+++*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+++* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+++*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+++*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+++*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+++*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+++*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+++*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+++*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+++*****************************************************************************/
+ +
+-     if (ARCH_X86)
+-         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+-     if (ARCH_ARM)
+-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
+-index 9f1f6dd..e221e54 100644
+---- a/libavcodec/hevcdsp.h
+-+++ b/libavcodec/hevcdsp.h
+-@@ -42,6 +42,17 @@ typedef struct SAOParams {
+-     uint8_t type_idx[3];    ///< sao_type_idx
+- } SAOParams;
+- 
+-+typedef struct Mv {
+-+    int16_t x;  ///< horizontal component of motion vector
+-+    int16_t y;  ///< vertical component of motion vector
+-+} Mv;
+++#ifndef __USER_VCSM__H__INCLUDED__
+++#define __USER_VCSM__H__INCLUDED__
+ +
+-+typedef struct MvField {
+-+    DECLARE_ALIGNED(4, Mv, mv)[2];
+-+    int8_t ref_idx[2];
+-+    int8_t pred_flag;
+-+} MvField;
+++/* VideoCore Shared Memory - user interface library.
+++**
+++** This library provides all the necessary abstraction for any application to
+++** make use of the shared memory service which is distributed across a kernel
+++** driver and a videocore service.
+++**
+++** It is an application design decision to choose or not to use this service.
+++**
+++** The logical flow of operations that a user application needs to follow when
+++** using this service is:
+++**
+++**       1) Initialize the service.
+++**       2) Allocate shared memory blocks.
+++**       3) Start using the allocated blocks.
+++**          - In order to gain ownership on a block, lock the allocated block,
+++**            locking a block returns a valid address that the user application
+++**            can access.
+++**          - When finished with using the block for the current execution cycle
+++**            or function, and so when giving up the ownership, unlock the block.
+++**       4) A block can be locked/unlocked as many times as required - within or outside
+++**          of - a specific execution context.
+++**       5) To completely release an allocated block, free it.
+++**       6) If the service is no longer required, terminate it.
+++**
+++**
+++** Some generic considerations:
+ +
+- typedef struct HEVCDSPContext {
+-     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+-                     struct GetBitContext *gb, int pcm_bit_depth);
+-@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
+-     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+-                                         int32_t *tc, uint8_t *no_p,
+-                                         uint8_t *no_q);
+-+    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+-+                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+                                               MvField *curr, MvField *neigh, uint8_t *bs);
+- } HEVCDSPContext;
+- 
+- void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
+--- 
+-2.7.4
+-
+-
+-From 619366d6acfd5f040a3116fda97b1146c8e40250 Mon Sep 17 00:00:00 2001
+-From: Peter de Rivaz <peter.derivaz@gmail.com>
+-Date: Wed, 15 Jul 2015 09:09:11 +0100
+-Subject: [PATCH 68/68] Only enable qpu when needed
+-
+----
+- libavcodec/hevc.h    |  2 +-
+- libavcodec/rpi_qpu.c | 21 ++++++++++++++++-----
+- 2 files changed, 17 insertions(+), 6 deletions(-)
+-
+-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index 496c0e1..ce14975 100644
+---- a/libavcodec/hevc.h
+-+++ b/libavcodec/hevc.h
+-@@ -57,7 +57,7 @@
+-   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+-   #define RPI_WORKER
+-   // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+--  #define RPI_DEBLOCK_VPU
+-+  //#define RPI_DEBLOCK_VPU
+- 
+- #endif
+- 
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index 5aa0432..ffd13ca 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -9,7 +9,7 @@
+- // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
+- #define RPI_ASYNC
+- // Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
+--#define RPI_COMBINE_JOBS
+-+//#define RPI_COMBINE_JOBS
+- 
+- #include <stdio.h>
+- #include <stdlib.h>
+-@@ -143,9 +143,9 @@ static int gpu_init(volatile struct GPU **gpu) {
+-   volatile struct GPU* ptr;
+- 	if (mb < 0)
+- 		return -1;
+--
+-+#ifndef RPI_ASYNC
+- 	if (qpu_enable(mb, 1)) return -2;
+--
+-+#endif
+-   vcsm_init();
+-   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-@@ -336,9 +336,9 @@ static void gpu_term(void)
+-     vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
+-     pthread_join(vpu_thread, &res);
+-   }
+--#endif
+--
+-+#else
+-   qpu_enable(mb, 0);
+-+#endif
+-   gpu_free_internal(&gpu_mem_ptr);
+- 
+-   vcsm_exit();
+-@@ -400,6 +400,7 @@ static void *vpu_start(void *arg) {
+-   int count_deblock=0;
+-   int count_qpu=0;
+- #endif
+-+  int qpu_started = 0;
+-   while(1) {
+-     int i;
+-     int *p; // Pointer for a QPU/VPU job
+-@@ -427,6 +428,12 @@ static void *vpu_start(void *arg) {
+-     if (p[7] == 0 && p[0] == 0 && p[16]==0)
+-       goto job_done_early;
+- 
+-+    if (!qpu_started) {
+-+      int result = qpu_enable(gpu->mb, 1);
+-+      av_assert0(result==0);
+-+      qpu_started = 1;
+-+    }
+++** Allocating memory blocks.
+++**
+++**   Memory blocks can be allocated in different manners depending on the cache
+++**   behavior desired.  A given block can either be:
+ +
+- #ifdef RPI_COMBINE_JOBS
+-     // First scan for a qpu job
+-     for (int x=0;x<num_jobs;x++) {
+-@@ -556,6 +563,10 @@ job_done_early:
+-     pthread_mutex_unlock(&post_mutex);
+-   }
+- 
+-+  if (qpu_started) {
+-+    qpu_enable(gpu->mb, 0);
+-+  }
+++**       - Allocated in a non cached fashion all the way through host and videocore.
+++**       - Allocated in a cached fashion on host OR videocore.
+++**       - Allocated in a cached fashion on host AND videocore.
+++**
+++**   It is an application decision to determine how to allocate a block.  Evidently
+++**   if the application will be doing substantial read/write accesses to a given block,
+++**   it is recommended to allocate the block at least in a 'host cached' fashion for
+++**   better results.
+++**
+++**
+++** Locking memory blocks.
+++**
+++**   When the memory block has been allocated in a host cached fashion, locking the
+++**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+++**
+++**   For the above reason and when using host cached allocation, it is important that
+++**   an application properly implements the lock/unlock mechanism to ensure cache will
+++**   stay coherent, otherwise there is no guarantee that it will be.
+++**
+++**   It is possible to dynamically change the host cache behavior (ie cached or non
+++**   cached) of a given allocation without needing to free and re-allocate the block.
+++**   This feature can be useful for an application which requires access to the block
+++**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+++**   the application can optimize performance for a given duration of use.
+++**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+++**   cache.  If one needs to change the videocore cache behavior, then a new block
+++**   must be created to replace the old one.
+++**
+++**   On successful locking, a valid pointer is returned that the application can use
+++**   to access data inside the block.  There is no guarantee that the pointer will
+++**   stay valid following the unlock action corresponding to this lock.
+++**
+++**
+++** Unlocking memory blocks.
+++**
+++**   When the memory block has been allocated in a host cached fashion, unlocking the
+++**   memory block (and so giving up its ownership) will trigger a cache flush unless
+++**   explicitly asked not to flush the cache for performance reasons.
+++**
+++**   For the above reason and when using host cached allocation, it is important that
+++**   an application properly implements the lock/unlock mechanism to ensure cache will
+++**   stay coherent, otherwise there is no guarantee that it will be.
+++**
+++**
+++** A complete API is defined below.
+++*/
+ +
+-   return NULL;
+- }
+- 
+--- 
+-2.7.4
+-
+-From a0d0946951b53e64ce103dd61b455f8d1f72caf9 Mon Sep 17 00:00:00 2001
+-From: John Cox <jc@kynesim.co.uk>
+-Date: Tue, 9 Feb 2016 11:57:40 +0000
+-Subject: [PATCH 1/2] Zero copy code v6
+-
+-This version has GPU buffer pooling code
+----
+- ffmpeg.c                 | 123 +++++++++-----
+- libavcodec/Makefile      |   2 +
+- libavcodec/avcodec.h     |   6 +
+- libavcodec/hevc.c        |  92 ++++++-----
+- libavcodec/hevc_filter.c |  83 +++++-----
+- libavcodec/rpi_qpu.c     |   2 +-
+- libavcodec/rpi_qpu.h     | 109 ++++++++++++-
+- libavcodec/rpi_zc.c      | 406 +++++++++++++++++++++++++++++++++++++++++++++++
+- libavcodec/rpi_zc.h      |  83 ++++++++++
+- 9 files changed, 779 insertions(+), 127 deletions(-)
+- create mode 100644 libavcodec/rpi_zc.c
+- create mode 100644 libavcodec/rpi_zc.h
+-
+-diff --git a/ffmpeg.c b/ffmpeg.c
+-index 50c6e86..953e5b8 100644
+---- a/ffmpeg.c
+-+++ b/ffmpeg.c
+-@@ -25,7 +25,7 @@
+- 
+- #ifdef RPI
+- #define RPI_DISPLAY
+--//#define RPI_ZERO_COPY
+-+#define RPI_ZERO_COPY
+- #endif
+- 
+- #include "config.h"
+-@@ -80,9 +80,7 @@
+- #include <interface/mmal/util/mmal_default_components.h>
+- #include <interface/mmal/util/mmal_connection.h>
+- #include <interface/mmal/util/mmal_util_params.h>
+--#ifdef RPI_ZERO_COPY
+--#include "libavcodec/rpi_qpu.h"
+--#endif
+-+#include "libavcodec/rpi_zc.h"
+- #endif
+- 
+- #if HAVE_SYS_RESOURCE_H
+-@@ -183,13 +181,7 @@ static void free_input_threads(void);
+- 
+- static MMAL_COMPONENT_T* rpi_display = NULL;
+- static MMAL_POOL_T *rpi_pool = NULL;
+--
+--#ifdef RPI_ZERO_COPY
+--static uint8_t *get_vc_handle(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return (uint8_t *)p->vc_handle;
+--}
+--#endif
+-+static volatile int rpi_display_count = 0;
+- 
+- static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+- {
+-@@ -206,7 +198,7 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+-     for (i = 0; i < NUM_BUFFERS; ++i)
+-     {
+-        MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
+--       void* bufPtr = buffer->data;
+-+       char * bufPtr = buffer->data;
+-        memset(bufPtr, i*30, w*h);
+-        memset(bufPtr+w*h, 128, (w*h)/2);
+-     }
+-@@ -215,23 +207,31 @@ static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
+-     return pool;
+- }
+- 
+--static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
+-+#ifdef RPI_ZERO_COPY
+-+    av_rpi_zc_unref(buffer->user_data);
+-+    --rpi_display_count;
+++#ifdef __cplusplus
+++extern "C"
+++{
+ +#endif
+-+    mmal_buffer_header_release(buffer);
+-+}
+-+
+-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
+-   mmal_buffer_header_release(buffer);
+- }
+- 
+- static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- {
+-     MMAL_COMPONENT_T* display;
+--    int w2 = (w+31)&~31;
+--    int h2 = (h+15)&~15;
+-     MMAL_DISPLAYREGION_T region =
+-     {
+--        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+-+        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
+-         .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
+-         .layer = 2,
+-         .fullscreen = 0,
+-         .dest_rect = {x, y, w, h}
+-     };
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+ +
+-     bcm_host_init();  // TODO is this needed?
+-     mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+-     assert(display);
+-@@ -240,8 +240,8 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- 
+-     MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-     format->encoding = MMAL_ENCODING_I420;
+--    format->es->video.width = w2;
+--    format->es->video.height = h2;
+-+    format->es->video.width = geo.stride_y;
+-+    format->es->video.height = geo.height_y;
+-     format->es->video.crop.x = 0;
+-     format->es->video.crop.y = 0;
+-     format->es->video.crop.width = w;
+-@@ -250,46 +250,75 @@ static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+- 
+-     mmal_component_enable(display);
+- 
+--    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
+-+    rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y);
+- 
+-     mmal_port_enable(display->input[0],display_cb_input);
+--    mmal_port_enable(display->control,display_cb_input);
+-+    mmal_port_enable(display->control,display_cb_control);
+- 
+--    printf("Allocated display %d %d\n",w,h);
+-+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+- 
+-     return display;
+- }
+- 
+--static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
+-+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr)
+- {
+--    int w = fr->width;
+--    int h = fr->height;
+--    int w2 = (w+31)&~31;
+--    int h2 = (h+15)&~15;
+-     if (!display || !rpi_pool)
+-         return;
+++/* Different statuses that can be dumped.
+++*/
+++typedef enum
+++{
+++   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+++                                    // Result of the walk is seen in the videocore
+++                                    // log.
+++   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+++                                    // driver (ie for all processes).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+++                                    // driver (for current process).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+++                                    // driver (for current process).  Result of
+++                                    // the walk is seen in the kernel log.
+++   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+++                                    // VCSM_STATUS_HOST_WALK_MAP.
+++                                    //
+++   VCSM_STATUS_NONE,                // Must be last - invalid.
+ +
+-+    if (rpi_display_count >= 3) {
+-+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
+-+        return;
+-+    }
+++} VCSM_STATUS_T;
+ +
+-     MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
+-     if (!buf) {
+--      // Running too fast so drop the frame
+--      return;
+-+        // Running too fast so drop the frame
+-+        printf("Q alloc failure\n");
+-+        return;
+-     }
+-     assert(buf);
+-     buf->cmd = 0;
+--    buf->length = (w2 * h2 * 3)/2;
+-     buf->offset = 0; // Offset to valid data
+-     buf->flags = 0;
+- #ifdef RPI_ZERO_COPY
+--    buf->data = get_vc_handle(fr->buf[0]);
+--    buf->alloc_size = (w2*h2*3)/2;
+++/* Different kinds of cache behavior.
+++*/
+++typedef enum
+ +{
+-+    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+++   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+++   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+++   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+ +
+-+    buf->user_data = fr_buf;
+-+    buf->data = av_rpi_zc_vc_handle(fr_buf);
+-+    buf->alloc_size =
+-+        buf->length = av_rpi_zc_numbytes(fr_buf);
+++} VCSM_CACHE_TYPE_T;
+ +
+-+    ++rpi_display_count;
+-+}
+- #else
+-+{
+-+#error YYY
+-+    int w = fr->width;
+-+    int h = fr->height;
+-+    int w2 = (w+31)&~31;
+-+    int h2 = (h+15)&~15;
+++/* Initialize the vcsm processing.
+++**
+++** Must be called once before attempting to do anything else.
+++**
+++** Returns 0 on success, -1 on error.
+++*/
+++int vcsm_init( void );
+ +
+-+    buf->length = (w2 * h2 * 3)/2;
+-+    buf->user_data = NULL;
+ +
+-     //mmal_buffer_header_mem_lock(buf);
+-     memcpy(buf->data, fr->data[0], w2 * h);
+-     memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
+-     memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
+-     //mmal_buffer_header_mem_unlock(buf);
+-+}
+- #endif
+- 
+--    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
+-+    while (rpi_display_count >= 3) {
+-+        usleep(5000);
+-+    }
+++/* Terminates the vcsm processing.
+++**
+++** Must be called when vcsm services are no longer needed; it will
+++** take care of removing any allocation under the current process
+++** control if deemed necessary.
+++*/
+++void vcsm_exit( void );
+ +
+-+    if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS)
+-+    {
+-+        printf("** send failed: depth=%d\n", rpi_display_count);
+-+        display_cb_input(NULL, buf);
+-+    }
+- }
+- 
+- static void display_exit(MMAL_COMPONENT_T* display)
+-@@ -687,6 +716,11 @@ static void ffmpeg_cleanup(int ret)
+-         avformat_close_input(&input_files[i]->ctx);
+-         av_freep(&input_files[i]);
+-     }
+ +
+-+#ifdef RPI_DISPLAY
+-+    display_exit(rpi_display);
+-+#endif
+++/* Queries the status of the vcsm.
+++**
+++** Triggers dump of various kind of information, see the
+++** different variants specified in VCSM_STATUS_T.
+++**
+++** Pid is optional.
+++*/
+++void vcsm_status( VCSM_STATUS_T status, int pid );
+ +
+-     for (i = 0; i < nb_input_streams; i++) {
+-         InputStream *ist = input_streams[i];
+- 
+-@@ -698,6 +732,9 @@ static void ffmpeg_cleanup(int ret)
+-         av_freep(&ist->filters);
+-         av_freep(&ist->hwaccel_device);
+- 
+-+#ifdef RPI_ZERO_COPY
+-+        av_rpi_zc_uninit(ist->dec_ctx);
+-+#endif
+-         avcodec_free_context(&ist->dec_ctx);
+- 
+-         av_freep(&input_streams[i]);
+-@@ -729,9 +766,6 @@ static void ffmpeg_cleanup(int ret)
+-     term_exit();
+-     ffmpeg_exited = 1;
+- 
+--#ifdef RPI_DISPLAY
+--    display_exit(rpi_display);
+--#endif
+- }
+- 
+- void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -1091,18 +1125,19 @@ static void do_video_out(AVFormatContext *s,
+-     int frame_size = 0;
+-     InputStream *ist = NULL;
+-     AVFilterContext *filter = ost->filter->filter;
+ +
+-+    if (ost->source_index >= 0)
+-+        ist = input_streams[ost->source_index];
+++/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+++** allocator.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc( unsigned int size, char *name );
+ +
+- #ifdef RPI_DISPLAY
+--    if (next_picture)
+-+    if (next_picture && ist != NULL)
+-     {
+--	if (!rpi_display)
+-+        if (!rpi_display)
+-            rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+--        display_frame(rpi_display,next_picture);
+-+        display_frame(ist->dec_ctx, rpi_display, next_picture);
+-     }
+- #endif
+- 
+--    if (ost->source_index >= 0)
+--        ist = input_streams[ost->source_index];
+--
+-     if (filter->inputs[0]->frame_rate.num > 0 &&
+-         filter->inputs[0]->frame_rate.den > 0)
+-         duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
+-@@ -2708,6 +2743,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+-         ist->dec_ctx->opaque                = ist;
+-         ist->dec_ctx->get_format            = get_format;
+-         ist->dec_ctx->get_buffer2           = get_buffer;
+ +
+-+#ifdef RPI_ZERO_COPY
+-+        // Overrides the above get_buffer2
+-+        av_rpi_zc_init(ist->dec_ctx);
+-+#endif
+++/* Allocates a cached block of memory of size 'size' via the vcsm memory
+++** allocator, the type of caching requested is passed as argument of the
+++** function call.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+ +
+-         ist->dec_ctx->thread_safe_callbacks = 1;
+- 
+-         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
+-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index 03065cd..21e4514 100644
+---- a/libavcodec/Makefile
+-+++ b/libavcodec/Makefile
+-@@ -9,6 +9,7 @@ HEADERS = avcodec.h                                                     \
+-           rpi_shader.h                                                  \
+-           rpi_mailbox.h                                                 \
+-           rpi_hevc_transform.h                                          \
+-+          rpi_zc.h                                                      \
+-           d3d11va.h                                                     \
+-           dirac.h                                                       \
+-           dv_profile.h                                                  \
+-@@ -50,6 +51,7 @@ OBJS = allcodecs.o                                                      \
+-        rpi_qpu.o                                                        \
+-        rpi_shader.o                                                     \
+-        rpi_mailbox.o                                                    \
+-+       rpi_zc.o                                                         \
+-        vorbis_parser.o                                                  \
+-        xiph.o                                                           \
+- 
+-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+-index 39713ed..a1ba217 100644
+---- a/libavcodec/avcodec.h
+-+++ b/libavcodec/avcodec.h
+-@@ -3505,6 +3505,12 @@ typedef struct AVCodecContext {
+- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1
+- #endif
+- 
+-+    /**
+-+     * Opaque pointer for use by replacement get_buffer2 code
+-+     *
+-+     * @author jc (08/02/2016)
+-+     */
+-+    void * get_buffer_context;
+- } AVCodecContext;
+- 
+- AVRational av_codec_get_pkt_timebase         (const AVCodecContext *avctx);
+-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index 8437e10..51736c7 100644
+---- a/libavcodec/hevc.c
+-+++ b/libavcodec/hevc.c
+-@@ -114,10 +114,6 @@ static uint32_t rpi_filter_coefs[8][1] = {
+-         { ENCODE_COEFFS(  -2,  10,  58,  -2) }
+- };
+- 
+--static uint32_t get_vc_address(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc;
+--}
+- #endif
+- 
+- 
+-@@ -2197,9 +2193,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-@@ -2207,7 +2203,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   } else {
+-                       *y++ = 1; // Weight of 1 and offset of 0
+-                   }
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+-@@ -2246,8 +2242,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2258,8 +2254,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-                       }
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -2297,9 +2293,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   if (weight_flag) {
+-@@ -2307,7 +2303,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   } else {
+-                       *y++ = 1; // Weight of 1 and offset of 0
+-                   }
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-                 }
+-             }
+-@@ -2347,8 +2343,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-@@ -2360,8 +2356,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                           *u++ = 1; // Weight of 1 and offset of 0
+-                           *u++ = 1;
+-                       }
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -2403,13 +2399,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                   int bw = nPbW-start_x;
+-                   int bh = nPbH-start_y;
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+--                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-                   *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
+-                   *y++ = my2_mx2_my_mx;
+-                   *y++ = 1; // B frame weighted prediction not supported
+--                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-                 }
+-             }
+-@@ -2453,8 +2449,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx][0];
+-                       *u++ = rpi_filter_coefs[_my][0];
+-@@ -2464,14 +2460,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
+--                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-                       *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-                       *u++ = rpi_filter_coefs[_mx2][0];
+-                       *u++ = rpi_filter_coefs[_my2][0];
+-                       u+=2; // Weights not supported in B slices
+--                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+--                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-                     }
+-                 }
+-                 s->curr_u_mvs = u;
+-@@ -3270,12 +3266,13 @@ static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,
+-    return vsum;
+- }
+- 
+--static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+- {
+-   //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-   int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-   int pitch = frame->linesize[cIdx];
+--  uint32_t base = get_vc_address(frame->buf[cIdx]);
+-+  uint32_t base = c_idx == 0 ? get_vc_address_y(frame);
+-+    c_idx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+-   if (p>=base && p<base+pitch*pic_height) {
+-     return frame->data[cIdx] + (p-base);
+-   }
+-@@ -3562,6 +3559,7 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
+- #ifdef RPI
+- 
+- #ifndef RPI_FAST_CACHEFLUSH
+-+#error RPI_FAST_CACHEFLUSH is broken
+- static void flush_buffer(AVBufferRef *bref) {
+-     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-     gpu_cache_flush(p);
+-@@ -3572,7 +3570,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-     int n = s->ps.sps->height;
+-     int curr_y = 0;
+-     int curr_uv = 0;
+-@@ -3580,21 +3578,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
+-     int sz,base;
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = (int)(p->arm) + base;
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-     iocache.s[0].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[2]);
+--    iocache.s[1].handle = p->vcsm_handle;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = (int)(p->arm) + base;
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-     iocache.s[1].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+--    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = (int)(p->arm) + base;
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-     iocache.s[2].size  = sz;
+-     vcsm_clean_invalid( &iocache );
+- #else
+-@@ -3612,7 +3610,7 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
+-     int curr_y;
+-     int curr_uv;
+-     int n_uv;
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-     int sz,base;
+-     int (*d)[2] = s->dblk_cmds[job];
+-     int low=(*d)[1];
+-@@ -3629,21 +3627,21 @@ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM
+- 
+-     sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-     base = s->frame->linesize[1] * curr_uv;
+--    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-     iocache.s[0].cmd = 3; // clean+invalidate
+--    iocache.s[0].addr = (int)(p->arm) + base;
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-     iocache.s[0].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[2]);
+--    iocache.s[1].handle = p->vcsm_handle;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-     iocache.s[1].cmd = 3; // clean+invalidate
+--    iocache.s[1].addr = (int)(p->arm) + base;
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-     iocache.s[1].size  = sz;
+--    p = av_buffer_pool_opaque(frame->buf[0]);
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-     sz = s->frame->linesize[0] * (n-curr_y);
+-     base = s->frame->linesize[0] * curr_y;
+--    iocache.s[2].handle = p->vcsm_handle;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-     iocache.s[2].cmd = 3; // clean+invalidate
+--    iocache.s[2].addr = (int)(p->arm) + base;
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-     iocache.s[2].size  = sz;
+- 
+-     iocache.s[3].handle = p0->vcsm_handle;
+-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 826a82f..c4fa305 100644
+---- a/libavcodec/hevc_filter.c
+-+++ b/libavcodec/hevc_filter.c
+-@@ -879,17 +879,25 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+- #undef CR
+- 
+- #ifdef RPI_INTER_QPU
+--static void flush_buffer(AVBufferRef *bref) {
+--    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--    gpu_cache_flush(p);
+-+static void flush_buffer_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+-+    gpu_cache_flush(&p);
+- }
+- 
+--// Return Physical address for this image
+--static uint32_t get_vc_address(AVBufferRef *bref) {
+--  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+--  return p->vc;
+-+static void flush_buffer_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+-+    gpu_cache_flush(&p);
+- }
+- 
+-+static void flush_buffer_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+-+    gpu_cache_flush(&p);
+-+}
+++
+++/* Shares an allocated block of memory via the vcsm memory allocator.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** On success, the user must invoke vcsm_lock with the returned opaque
+++** handle to gain access to the memory associated with the opaque handle.
+++** When finished using the memory, the user calls vcsm_unlock_xx (see those
+++** function definitions for more details on the one that can be used).
+++**
+++** A well behaved application should make every attempt to lock/unlock
+++** only for the duration it needs to access the memory data associated with
+++** the opaque handle.
+++*/
+++unsigned int vcsm_malloc_share( unsigned int handle );
+ +
+ +
+-+#ifdef RPI_DEBLOCK_VPU
+-+#error Not fixed yet
+++/* Resizes a block of memory allocated previously by vcsm_malloc.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** The handle must be unlocked by user prior to attempting any
+++** resize action.
+++**
+++** On error, the original size allocated against the handle
+++** remains available the same way it would be following a
+++** successful vcsm_malloc.
+++*/
+++int vcsm_resize( unsigned int handle, unsigned int new_size );
+ +
+- // ff_hevc_flush_buffer_lines
+- // flushes and invalidates all pixel rows in [start,end-1]
+- static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+-@@ -901,44 +909,44 @@ static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int f
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        GPU_MEM_PTR_T *p;
+-+        GPU_MEM_PTR_T p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+-         if (flush_chroma) {
+--          p = av_buffer_pool_opaque(s->frame->buf[1]);
+--          iocache.s[0].handle = p->vcsm_handle;
+-+          p = get_gpu_mem_ptr_u(s->frame);
+-+          iocache.s[0].handle = p.vcsm_handle;
+-           iocache.s[0].cmd = 3; // clean+invalidate
+--          iocache.s[0].addr = (int)p->arm + base;
+-+          iocache.s[0].addr = (int)p.arm + base;
+-           iocache.s[0].size  = sz;
+--          p = av_buffer_pool_opaque(s->frame->buf[2]);
+--          iocache.s[1].handle = p->vcsm_handle;
+-+          p = get_gpu_mem_ptr_v(s->frame);
+-+          iocache.s[1].handle = p.vcsm_handle;
+-           iocache.s[1].cmd = 3; // clean+invalidate
+--          iocache.s[1].addr = (int)p->arm + base;
+-+          iocache.s[1].addr = (int)p.arm + base;
+-           iocache.s[1].size  = sz;
+-         }
+-         if (flush_luma) {
+--          p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+          p = get_gpu_mem_ptr_y(s->frame);
+-           sz = s->frame->linesize[0] * (n-curr_y);
+-           base = s->frame->linesize[0] * curr_y;
+--          iocache.s[2].handle = p->vcsm_handle;
+-+          iocache.s[2].handle = p.vcsm_handle;
+-           iocache.s[2].cmd = 3; // clean+invalidate
+--          iocache.s[2].addr = (int)p->arm + base;
+-+          iocache.s[2].addr = (int)p.arm + base;
+-           iocache.s[2].size  = sz;
+-         }
+-         vcsm_clean_invalid( &iocache );
+- #else
+-         if (flush_chroma) {
+--          flush_buffer(s->frame->buf[1]);
+--          flush_buffer(s->frame->buf[2]);
+-+          flush_buffer_u(s->frame);
+-+          flush_buffer_v(s->frame);
+-         }
+-         if (flush_luma) {
+--          flush_buffer(s->frame->buf[0]);
+-+          flush_buffer_y(s->frame);
+-         }
+- #endif
+- }
+--
+-+#endif
+- 
+- void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- {
+-@@ -950,37 +958,37 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+-         int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-         int n_uv = n >> s->ps.sps->vshift[1];
+-         int sz,base;
+--        GPU_MEM_PTR_T *p;
+-+        GPU_MEM_PTR_T p;
+-         if (curr_uv < 0) curr_uv = 0;
+-         if (n_uv<=curr_uv) { return; }
+-         sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-         base = s->frame->linesize[1] * curr_uv;
+--        p = av_buffer_pool_opaque(s->frame->buf[1]);
+--        iocache.s[0].handle = p->vcsm_handle;
+-+        p = get_gpu_mem_ptr_u(s->frame);
+-+        iocache.s[0].handle = p.vcsm_handle;
+-         iocache.s[0].cmd = 3; // clean+invalidate
+--        iocache.s[0].addr = (int)p->arm + base;
+-+        iocache.s[0].addr = (int)p.arm + base;
+-         iocache.s[0].size  = sz;
+--        p = av_buffer_pool_opaque(s->frame->buf[2]);
+--        iocache.s[1].handle = p->vcsm_handle;
+-+        p = get_gpu_mem_ptr_v(s->frame);
+-+        iocache.s[1].handle = p.vcsm_handle;
+-         iocache.s[1].cmd = 3; // clean+invalidate
+--        iocache.s[1].addr = (int)p->arm + base;
+-+        iocache.s[1].addr = (int)p.arm + base;
+-         iocache.s[1].size  = sz;
+- 
+- #ifdef RPI_LUMA_QPU
+--        p = av_buffer_pool_opaque(s->frame->buf[0]);
+-+        p = get_gpu_mem_ptr_y(s->frame);
+-         sz = s->frame->linesize[0] * (n-curr_y);
+-         base = s->frame->linesize[0] * curr_y;
+--        iocache.s[2].handle = p->vcsm_handle;
+-+        iocache.s[2].handle = p.vcsm_handle;
+-         iocache.s[2].cmd = 3; // clean+invalidate
+--        iocache.s[2].addr = (int)p->arm + base;
+-+        iocache.s[2].addr = (int)p.arm + base;
+-         iocache.s[2].size  = sz;
+- #endif
+-         vcsm_clean_invalid( &iocache );
+- #else
+--        flush_buffer(s->frame->buf[1]);
+--        flush_buffer(s->frame->buf[2]);
+-+        flush_buffer_u(s->frame);
+-+        flush_buffer_v(s->frame);
+- #ifdef RPI_LUMA_QPU
+--        flush_buffer(s->frame->buf[0]);
+-+        flush_buffer_y(s->frame);
+- #endif
+- 
+- #endif
+-@@ -992,6 +1000,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+- #endif
+- 
+- #ifdef RPI_DEBLOCK_VPU
+-+#error XXX
+- /* rpi_deblock deblocks an entire row of ctbs using the VPU */
+- static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+- {
+-@@ -1000,21 +1009,21 @@ static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+-   // TODO flush buffer of beta/tc setup when it becomes cached
+- 
+-   // Prepare three commands at once to avoid calling overhead
+--  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
+-+  s->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+-   s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+-   s->vpu_cmds_arm[0][2] = s->setup_width;
+-   s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
+-   s->vpu_cmds_arm[0][4] = ctb_size>>4;
+-   s->vpu_cmds_arm[0][5] = 2;
+- 
+--  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-+  s->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+-   s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+-   s->vpu_cmds_arm[1][2] = s->uv_setup_width;
+-   s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-   s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+-   s->vpu_cmds_arm[1][5] = 3;
+- 
+--  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-+  s->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+-   s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+-   s->vpu_cmds_arm[2][2] = s->uv_setup_width;
+-   s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index ffd13ca..b0c9bc5 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -250,7 +250,7 @@ int gpu_get_mailbox(void)
+- }
+- 
+- // Call this to clean and invalidate a region of memory
+--void gpu_cache_flush(GPU_MEM_PTR_T *p)
+-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+- {
+- #ifdef RPI_FAST_CACHEFLUSH
+-     struct vcsm_user_clean_invalid_s iocache = {};
+-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+-index 81c2bb1..b913f79 100644
+---- a/libavcodec/rpi_qpu.h
+-+++ b/libavcodec/rpi_qpu.h
+-@@ -2,8 +2,11 @@
+- #define RPI_QPU_H
+- 
+- // Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+- #define RPI_FAST_CACHEFLUSH
+- 
+-+#define RPI_ONE_BUF 1
+ +
+- typedef struct gpu_mem_ptr_s {
+-   unsigned char *arm; // Pointer to memory mapped on ARM side
+-   int vc_handle;   // Videocore handle of relocatable memory
+-@@ -16,9 +19,113 @@ typedef struct gpu_mem_ptr_s {
+- extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+- extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+- extern void gpu_free(GPU_MEM_PTR_T *p);
+--extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+- extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+- 
+-+#include "libavutil/frame.h"
+-+#if !RPI_ONE_BUF
+-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
+-+    return p->vc;
+-+}
+++/* Frees a block of memory that was successfully allocated by
+++** a prior call to vcsm_malloc.
+++**
+++** The handle should be considered invalid upon return from this
+++** call.
+++**
+++** Whether any memory is actually freed up or not as the result of
+++** this call will depend on many factors, if all goes well it will
+++** be freed.  If something goes wrong, the memory will likely end up
+++** being freed up as part of the vcsm_exit process.  In the end the
+++** memory is guaranteed to be freed one way or another.
+++*/
+++void vcsm_free( unsigned int handle );
+ +
+-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
+-+    return p->vc;
+-+}
+ +
+-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
+-+    return p->vc;
+-+}
+++/* Retrieves a videocore opaque handle from a mapped user address
+++** pointer.  The videocore handle will correspond to the actual
+++** memory mapped in videocore.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** Note: the videocore opaque handle is distinct from the user
+++**       opaque handle (allocated via vcsm_malloc) and is only
+++**       significant to applications that know what to do
+++**       with it; for all others it is just a number of little
+++**       use, since nothing can be done with it (in particular,
+++**       for safety reasons, it cannot be used to map anything).
+++*/
+++unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
+-+}
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
+-+}
+++/* Retrieves a videocore opaque handle from a user opaque
+++** handle.  The videocore handle will correspond to the actual
+++** memory mapped in videocore.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++**
+++** Note: the videocore opaque handle is distinct from the user
+++**       opaque handle (allocated via vcsm_malloc) and is only
+++**       significant to applications that know what to do
+++**       with it; for all others it is just a number of little
+++**       use, since nothing can be done with it (in particular,
+++**       for safety reasons, it cannot be used to map anything).
+++*/
+++unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
+-+}
+ +
+-+#else
+++/* Retrieves a user opaque handle from a mapped user address
+++** pointer.
+++**
+++** Returns:        0 on error
+++**                 a non-zero opaque handle on success.
+++*/
+++unsigned int vcsm_usr_handle( void *usr_ptr );
+ +
+-+static inline int gpu_is_buf1(const AVFrame * const frame)
+-+{
+-+    return frame->buf[1] == NULL;
+-+}
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
+-+{
+-+    return av_buffer_get_opaque(frame->buf[0]);
+-+}
+++/* Retrieves a mapped user address from an opaque user
+++** handle.
+++**
+++** Returns:        0 on error
+++**                 a non-zero address on success.
+++**
+++** On success, the address corresponds to the pointer
+++** which can access the data allocated via the vcsm_malloc
+++** call.
+++*/
+++void *vcsm_usr_address( unsigned int handle );
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+-+{
+-+    return av_buffer_pool_opaque(frame->buf[n]);
+-+}
+++
+++/* Locks the memory associated with this opaque handle.
+++**
+++** Returns:        NULL on error
+++**                 a valid pointer on success.
+++**
+++** A user MUST lock the handle received from vcsm_malloc
+++** in order to be able to use the memory associated with it.
+++**
+++** On success, the pointer returned is only valid while
+++** the lock is held (i.e. until a corresponding vcsm_unlock_xx
+++** is invoked).
+++*/
+++void *vcsm_lock( unsigned int handle );
+ +
+ +
+-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+-+}
+++/* Locks the memory associated with this opaque handle.  The lock
+++** also gives a chance to update the *host* cache behavior of the
+++** allocated buffer if so desired.  The *videocore* cache behavior
+++** of the allocated buffer cannot be changed by this call and such
+++** attempt will be ignored.
+++**
+++** The system will attempt to honour the cache_update mode request,
+++** the cache_result mode will provide the final answer on which cache
+++** mode is really in use.  Failing to change the cache mode will not
+++** result in a failure to lock the buffer as it is an application
+++** decision to choose what to do if (cache_result != cache_update)
+++**
+++** The value returned in cache_result can only be considered valid if
+++** the returned pointer is non NULL.  The cache_result pointer may be
+++** NULL if the application does not care about the actual outcome of
+++** its action with regards to the cache behavior change.
+++**
+++** Returns:        NULL on error
+++**                 a valid pointer on success.
+++**
+++** A user MUST lock the handle received from vcsm_malloc
+++** in order to be able to use the memory associated with it.
+++**
+++** On success, the pointer returned is only valid while
+++** the lock is held (i.e. until a corresponding vcsm_unlock_xx
+++** is invoked).
+++*/
+++void *vcsm_lock_cache( unsigned int handle,
+++                       VCSM_CACHE_TYPE_T cache_update,
+++                       VCSM_CACHE_TYPE_T *cache_result );
+++
+++
+++/* Unlocks the memory associated with this user mapped address.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking a mapped address, the user should no longer
+++** attempt to reference it.
+++*/
+++int vcsm_unlock_ptr( void *usr_ptr );
+++
+++
+++/* Unlocks the memory associated with this user mapped address.
+++** Apply special processing that would override the otherwise
+++** default behavior.
+++**
+++** If 'cache_no_flush' is specified:
+++**    Do not flush cache as the result of the unlock (if cache
+++**    flush was otherwise applicable in this case).
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking a mapped address, the user should no longer
+++** attempt to reference it.
+++*/
+++int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+++
+ +
+-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 1)->vc;
+-+}
+++/* Unlocks the memory associated with this user opaque handle.
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking an opaque handle, the user should no longer
+++** attempt to reference the mapped address once associated
+++** with it.
+++*/
+++int vcsm_unlock_hdl( unsigned int handle );
+ +
+-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 2)->vc;
+-+}
+ +
+++/* Unlocks the memory associated with this user opaque handle.
+++** Apply special processing that would override the otherwise
+++** default behavior.
+++**
+++** If 'cache_no_flush' is specified:
+++**    Do not flush cache as the result of the unlock (if cache
+++**    flush was otherwise applicable in this case).
+++**
+++** Returns:        0 on success
+++**                 -errno on error.
+++**
+++** After unlocking an opaque handle, the user should no longer
+++** attempt to reference the mapped address once associated
+++** with it.
+++*/
+++int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.numbytes = frame->data[1] - frame->data[0];
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 0);
+-+}
+++/* Clean and/or invalidate the memory associated with this user opaque handle
+++**
+++** Returns:        non-zero on error
+++**
+++** structure contains a list of flush/invalidate commands. Commands are:
+++** 0: nop
+++** 1: invalidate       given virtual range in L1/L2
+++** 2: clean            given virtual range in L1/L2
+++** 3: clean+invalidate given virtual range in L1/L2
+++** 4: flush all L1/L2
+++*/
+++struct vcsm_user_clean_invalid_s {
+++   struct {
+++      unsigned int cmd;
+++      unsigned int handle;
+++      unsigned int addr;
+++      unsigned int size;
+++   } s[8];
+++};
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.arm += frame->data[1] - frame->data[0];
+-+        g.vc += frame->data[1] - frame->data[0];
+-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 1);
+-+}
+++int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+ +
+-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
+-+    if (gpu_is_buf1(frame))
+-+    {
+-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
+-+        g.arm += frame->data[2] - frame->data[0];
+-+        g.vc += frame->data[2] - frame->data[0];
+-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
+-+        return g;
+-+    }
+-+    else
+-+        return *gpu_buf3_gmem(frame, 2);
+++#ifdef __cplusplus
+ +}
+-+
+ +#endif
+ +
+-+
+- // QPU specific functions
+- extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
+- extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
+++#endif /* __USER_VCSM__H__INCLUDED__ */
+ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+ new file mode 100644
+ index 0000000..9580165
+@@ -38057,80 +15466,3089 @@ index 0000000..f0109f4
+ +
+ +#endif
+ +
+--- 
+-2.7.4
+-
+-
+-From a6da64e1ca42f0394ccfa55dca782a456841da94 Mon Sep 17 00:00:00 2001
+-From: John Cox <jc@kynesim.co.uk>
+-Date: Tue, 1 Mar 2016 14:21:25 +0000
+-Subject: [PATCH 2/2] Set VPU scheduling thread to high priority after creation
+-
+----
+- libavcodec/rpi_qpu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
+- 1 file changed, 47 insertions(+), 1 deletion(-)
+-
+-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+-index b0c9bc5..ee19231 100644
+---- a/libavcodec/rpi_qpu.c
+-+++ b/libavcodec/rpi_qpu.c
+-@@ -182,9 +182,55 @@ static int gpu_init(volatile struct GPU **gpu) {
+-     err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-     //printf("Created thread\n");
+-     if (err) {
+--        printf("Failed to create vpu thread\n");
+-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+-         return -4;
+-     }
++diff --git a/libavcodec/utils.c b/libavcodec/utils.c
++index f7adb52..708526e 100644
++--- a/libavcodec/utils.c
+++++ b/libavcodec/utils.c
++@@ -26,6 +26,12 @@
++  */
++ 
++ #include "config.h"
+ +
+-+    {
+-+      struct sched_param param = {0};
+-+      int policy = 0;
+++#ifdef RPI
+++// Move video buffers to GPU memory
+++#define RPI_GPU_BUFFERS
+++#endif
+ +
+-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+      {
+-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+      }
+-+      else
+-+      {
+-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
++ #include "libavutil/atomic.h"
++ #include "libavutil/attributes.h"
++ #include "libavutil/avassert.h"
++@@ -64,6 +70,10 @@
++ #include "libavutil/ffversion.h"
++ const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
++ 
+++#ifdef RPI_GPU_BUFFERS
+++#include "rpi_qpu.h"
+++#endif
+ +
+-+        policy = SCHED_FIFO;
+-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
++ #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
++ static int default_lockmgr_cb(void **arg, enum AVLockOp op)
++ {
++@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
++     return ret;
++ }
++ 
+++#ifdef RPI_GPU_BUFFERS
+++static void rpi_buffer_default_free(void *opaque, uint8_t *data)
+++{
+++    GPU_MEM_PTR_T *p = opaque;
+++    gpu_free(p);
+++    av_free(p);
+++}
+ +
+-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+++static AVBufferRef *rpi_buffer_alloc(int size)
+++{
+++    AVBufferRef *ret = NULL;
+++    uint8_t    *data = NULL;
+++    GPU_MEM_PTR_T *p;
+ +
+-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+-+        {
+-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+-+        }
+-+        else
+-+        {
+-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+          {
+-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+          }
+-+          else
+-+          {
+-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+-+                policy,
+-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+                param.sched_priority);
+-+          }
+-+        }
+-+      }
+++    static int total=0;
+++    total+=size;
+++
+++    p = av_malloc(sizeof *p);
+++    if (!p)
+++        return NULL;
+++
+++    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
+++        return NULL;
+++
+++    data = p->arm;
+++    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
+++    //memset(data, 64, size);
+++
+++    if (!data)
+++        return NULL;
+ +
+++    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
+++    if (!ret) {
+++        gpu_free(p);
+++        av_freep(&p);
+ +    }
+ +
+-   }
+++    return ret;
+++}
+++#endif
+++
++ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
++ {
++     FramePool *pool = avctx->internal->pool;
++@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
++             av_buffer_pool_uninit(&pool->pools[i]);
++             pool->linesize[i] = linesize[i];
++             if (size[i]) {
+++#ifdef RPI_GPU_BUFFERS
+++                if (avctx->codec_id == AV_CODEC_ID_HEVC)
+++                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+++                                                     CONFIG_MEMORY_POISONING ?
+++                                                        NULL :
+++                                                        rpi_buffer_alloc);
+++                else
+++#endif
++                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
++                                                      CONFIG_MEMORY_POISONING ?
++                                                         NULL :
++diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
++index b31d233..2767306 100644
++--- a/libavformat/mpegts.c
+++++ b/libavformat/mpegts.c
++@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
+  #endif
++     { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
++     { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC        },
++-    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264       },
+++    { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC   },
++     { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000   },
++     { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC       },
++     { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS       },
++diff --git a/libavformat/utils.c b/libavformat/utils.c
++index 6f343f2..83f26d5 100644
++--- a/libavformat/utils.c
+++++ b/libavformat/utils.c
++@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
++         int default_stream_index = av_find_default_stream_index(s);
++         if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
++             for (i = 0; i < s->nb_streams; i++) {
++-                if (av_find_program_from_stream(s, NULL, i))
+++                if (0 && av_find_program_from_stream(s, NULL, i))
++                     continue;
++                 s->streams[i]->pts_wrap_reference = pts_wrap_reference;
++                 s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
++diff --git a/libavutil/buffer.c b/libavutil/buffer.c
++index 694e116..203ca7b 100644
++--- a/libavutil/buffer.c
+++++ b/libavutil/buffer.c
++@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
++ 
++     return ret;
++ }
+++
+++// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
+++void *av_buffer_pool_opaque(AVBufferRef *ref) {
+++  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+++  return buf->opaque;
+++}
++diff --git a/libavutil/buffer.h b/libavutil/buffer.h
++index 0c0ce12..82e0bc3 100644
++--- a/libavutil/buffer.h
+++++ b/libavutil/buffer.h
++@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
++  */
++ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
+  
+--- 
+-2.7.4
+-
+++// Return the opaque for the underlying frame
+++void *av_buffer_pool_opaque(AVBufferRef *ref);
+++
++ /**
++  * @}
++  */
++diff --git a/pi-util/conf.sh b/pi-util/conf.sh
++new file mode 100755
++index 0000000..8b596a2
++--- /dev/null
+++++ b/pi-util/conf.sh
++@@ -0,0 +1,33 @@
+++echo "Configure for Pi2/3"
+++
+++RPI_BUILDROOT=`pwd`/build
+++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+++#RPI_KEEPS="-save-temps=obj"
+++RPI_KEEPS=""
+++
+++./configure --enable-cross-compile\
+++ --arch=armv6t2\
+++ --cpu=cortex-a7\
+++ --target-os=linux\
+++ --disable-stripping\
+++ --disable-thumb\
+++ --enable-mmal\
+++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+++
+++# --enable-extra-warnings\
+++# --arch=armv71\
+++# --enable-shared\
+++
+++# gcc option for getting asm listing
+++# -Wa,-ahls
++diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
++new file mode 100644
++index 0000000..61d1399
++--- /dev/null
+++++ b/pi-util/conf_h265.csv
++@@ -0,0 +1,144 @@
+++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+++2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+++2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
+++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
+++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
+++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
+++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
+++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
+++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
+++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
+++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
+++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
+++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
+++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
+++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
+++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
+++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
+++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+++2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
+++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
+++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
+++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
+++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
+++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
+++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
+++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
+++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
+++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
+++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
+++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
+++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
+++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
+++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
+++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
+++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
+++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
+++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
+++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
+++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
+++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
+++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
+++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
+++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
+++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
+++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
+++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
+++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
+++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
+++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
+++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
+++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
+++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+++2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
+++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
+++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
+++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
+++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
+++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
+++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
+++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+++2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+++2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
+++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
+++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
+++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
+++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
+++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
+++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
+++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
+++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
+++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
+++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
+++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
+++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
+++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
+++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
+++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
+++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
+++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
+++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
+++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
+++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
+++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
+++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+++2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
+++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
+++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+++2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+++2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
+++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
+++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
+++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
+++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
+++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
+++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
+++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
+++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
+++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
+++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
++diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
++new file mode 100644
++index 0000000..38f942f
++--- /dev/null
+++++ b/pi-util/ffconf.py
++@@ -0,0 +1,146 @@
+++#!/usr/bin/env python
+++
+++import os
+++import subprocess
+++import re
+++import argparse
+++import sys
+++import csv
+++from stat import *
+++
+++conf_root = "/opt/conform/h265"
+++ffmpeg_exec = "./ffmpeg"
+++
+++def testone(fileroot, name, es_file, md5_file):
+++    tmp_root = "/tmp"
+++
+++    dec_file = os.path.join(tmp_root, name + ".dec.md5")
+++    try:
+++        os.remove(dec_file)
+++    except:
+++        pass
+++
+++    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
+++
+++    # Unaligned needed for cropping conformance
+++    rstr = subprocess.call(
+++        [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+++        stdout=flog, stderr=subprocess.STDOUT)
+++
+++    try:
+++        m1 = None
+++        m2 = None
+++        with open(os.path.join(fileroot, md5_file)) as f:
+++            for line in f:
+++                m1 = re.search("[0-9a-f]{32}", line.lower())
+++                if m1:
+++                    break
+++
+++        with open(dec_file) as f:
+++            m2 = re.search("[0-9a-f]{32}", f.readline())
+++    except:
+++        pass
+++
+++    rv = False
+++    if  m1 and m2 and m1.group() == m2.group():
+++        print >> flog, "Match: " + m1.group()
+++        rv = True
+++    elif not m1:
+++        print >> flog, "****** Cannot find m1"
+++    elif not m2:
+++        print >> flog, "****** Cannot find m2"
+++    else:
+++        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+++    flog.close()
+++    return rv
+++
+++def scandir(root):
+++    aconf = []
+++    ents = os.listdir(conf_root)
+++    ents.sort(key=str.lower)
+++    for name in ents:
+++        test_path = os.path.join(conf_root, name)
+++        if S_ISDIR(os.stat(test_path).st_mode):
+++            files = os.listdir(test_path)
+++            es_file = "?"
+++            md5_file = "?"
+++            for f in files:
+++                (base, ext) = os.path.splitext(f)
+++                if base[0] == '.':
+++                    pass
+++                elif ext == ".bit" or ext == ".bin":
+++                    es_file = f
+++                elif ext == ".md5":
+++                    if md5_file == "?":
+++                        md5_file = f
+++                    elif base[-3:] == "yuv":
+++                        md5_file = f
+++            aconf.append((1, name, es_file, md5_file))
+++    return aconf
+++
+++def runtest(name, tests):  # True if no filter was given or name starts with any entry of tests
+++    if not tests:
+++        return True
+++    for t in tests:
+++        if name[0:len(t)] == t:
+++            return True
+++    return False  # reached only after every prefix was tried, not after the first miss
+++
+++def doconf(csva, tests):
+++    # Run each selected CSV row through testone() and report the surprises.
+++    # Column 0 is the expectation: 0 = skip, 1 = should pass, 2 = should fail.
+++    unexpected_fail = []
+++    unexpected_pass = []
+++    for row in csva:
+++        expectation = int(row[0])
+++        if not (expectation and runtest(row[1], tests)):
+++            continue
+++        name = row[1]
+++        print "==== ", name,
+++        sys.stdout.flush()  # push the name out before testone() starts
+++        passed = testone(os.path.join(conf_root, name), name, row[2], row[3])
+++        if passed and expectation == 2:
+++            print ": * OK *"
+++            unexpected_pass.append(name)
+++        elif passed:
+++            print ": ok"
+++        elif expectation == 1:
+++            unexpected_fail.append(name)
+++            print ": * FAIL *"
+++        else:
+++            print ": fail"
+++
+++    if unexpected_fail or unexpected_pass:
+++        print "Unexpected Failures:", unexpected_fail
+++        print "Unexpected Success: ", unexpected_pass
+++    else:
+++        print "All tests normal"
+++
+++
+++class ConfCSVDialect(csv.Dialect):  # dialect handed to csv.reader for the --csv file
+++    delimiter = ','
+++    doublequote = True
+++    lineterminator = '\n'
+++    quotechar='"'
+++    quoting = csv.QUOTE_MINIMAL
+++    skipinitialspace = True  # ignore whitespace directly after a delimiter
+++    strict = True  # raise csv.Error on malformed input
+++
+++if __name__ == '__main__':
+++    # Command line: optional test-name prefixes plus the CSV options.
+++    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
+++    argp.add_argument("tests", nargs='*')
+++    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+++    argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename")
+++    args = argp.parse_args()
+++
+++    if args.csvgen:
+++        csv.writer(sys.stdout).writerows(scandir(conf_root))
+++        sys.exit(0)  # exit() is only provided by the site module
+++
+++    with open(args.csv, 'rt') as csvfile:
+++        csva = list(csv.reader(csvfile, ConfCSVDialect()))
+++
+++    # Run the rows selected by the positional prefixes (all rows when none given).
+++    doconf(csva, args.tests)
+++
++diff --git a/pi-util/qasm.py b/pi-util/qasm.py
++new file mode 100644
++index 0000000..1eacc04
++--- /dev/null
+++++ b/pi-util/qasm.py
++@@ -0,0 +1,2502 @@
+++#!/usr/bin/env python
+++
+++#    add.ifz.setf  -, r0, ra0 ; fmul  rb1, rany2, 0 ; thrend # comment
+++#    add  r0, r0, 1                    # implicit mul nop
+++#    nop                               # explicit add nop, implicit mul nop
+++#    bkpt                              # implicit add/mul nop
+++#    mov  r0, 0x1234                   # hex immediate
+++#    mov  r0, 20 * 40                  # expressions...
+++#    mov  r0, f(sqrt(2.0) * 3.0)       # f() converts float to bits
+++#    mov  r0, a:label                  # put address of label in r0
+++# :label
+++#    bra.allnn  ra2, a:1f              # branch to label 1 (searching forward), using absolute address
+++# :1
+++#    brr.anyz  -, r:1b                 # branch to label 1 (searching backward), using relative address
+++# :1                                   # multiple definitions of numeric labels (differentiated using f/b)
+++# .set my_val, 3                       # introduce alias for 3
+++# .set my_reg, r0                      # and for r0
+++#    mov  my_reg, my_val               # then use them
+++# .set my_reg2, my_reg + my_val        # r0 plus 3 is r3
+++# .macro my_add, a, b, c               # a, b, c act as if .set on entry
+++# .set my_val, 10
+++#    add  a, b, c
+++#    mov  r0, my_val                   # 10
+++# .endm                                # forget all .sets since .macro (including arg .sets)
+++#    mov  r0, my_val                   # 3
+++#    my_add  my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right)
+++
+++import math
+++import optparse
+++import os
+++import random
+++import re
+++import struct
+++import sys
+++import time
+++
+++###############################################################################
+++# constants
+++###############################################################################
+++
+++# ops
+++######
+++
+++# negatives are internal qasm ops
+++
+++AOP_MOV     = -3   # two operands
+++AOP_BRA     = -2   # two operands
+++AOP_BRR     = -1   # two operands
+++AOP_NOP     = 0x00 # no operands
+++AOP_FADD    = 0x01
+++AOP_FSUB    = 0x02
+++AOP_FMIN    = 0x03
+++AOP_FMAX    = 0x04
+++AOP_FMINABS = 0x05
+++AOP_FMAXABS = 0x06
+++AOP_FTOI    = 0x07 # two operands
+++AOP_ITOF    = 0x08 # two operands
+++AOP_ADD     = 0x0c
+++AOP_SUB     = 0x0d
+++AOP_SHR     = 0x0e
+++AOP_ASR     = 0x0f
+++AOP_ROR     = 0x10
+++AOP_SHL     = 0x11
+++AOP_MIN     = 0x12
+++AOP_MAX     = 0x13
+++AOP_AND     = 0x14
+++AOP_OR      = 0x15
+++AOP_XOR     = 0x16
+++AOP_NOT     = 0x17 # two operands
+++AOP_CLZ     = 0x18 # two operands
+++AOP_V8ADDS  = 0x1e
+++AOP_V8SUBS  = 0x1f
+++
+++MOP_MOV    = -1  # two operands
+++MOP_NOP    = 0x0 # no operands
+++MOP_FMUL   = 0x1
+++MOP_MUL24  = 0x2
+++MOP_V8MULD = 0x3
+++MOP_V8MIN  = 0x4
+++MOP_V8MAX  = 0x5
+++MOP_V8ADDS = 0x6
+++MOP_V8SUBS = 0x7
+++
+++# ldi modes
+++############
+++
+++LDI_32          = 0
+++LDI_EL_SIGNED   = 1
+++LDI_EL_UNSIGNED = 3
+++LDI_SEMA        = 4
+++
+++# conds
+++########
+++
+++COND_NEVER  = 0
+++COND_ALWAYS = 1
+++COND_IFZ    = 2
+++COND_IFNZ   = 3
+++COND_IFN    = 4
+++COND_IFNN   = 5
+++COND_IFC    = 6
+++COND_IFNC   = 7
+++
+++BCOND_ALLZ   = 0
+++BCOND_ALLNZ  = 1
+++BCOND_ANYZ   = 2
+++BCOND_ANYNZ  = 3
+++BCOND_ALLN   = 4
+++BCOND_ALLNN  = 5
+++BCOND_ANYN   = 6
+++BCOND_ANYNN  = 7
+++BCOND_ALLC   = 8
+++BCOND_ALLNC  = 9
+++BCOND_ANYC   = 10
+++BCOND_ANYNC  = 11
+++BCOND_ALWAYS = 15
+++
+++# packing/unpacking
+++####################
+++
+++# regfile a pack modes
+++PACK_A_NOP   = 0
+++PACK_A_16A   = 1
+++PACK_A_16B   = 2
+++PACK_A_8888  = 3
+++PACK_A_8A    = 4
+++PACK_A_8B    = 5
+++PACK_A_8C    = 6
+++PACK_A_8D    = 7
+++PACK_A_32S   = 8
+++PACK_A_16AS  = 9
+++PACK_A_16BS  = 10
+++PACK_A_8888S = 11
+++PACK_A_8AS   = 12
+++PACK_A_8BS   = 13
+++PACK_A_8CS   = 14
+++PACK_A_8DS   = 15
+++
+++# mul unit pack modes
+++PACK_MUL_NOP  = 0
+++PACK_MUL_8888 = 3
+++PACK_MUL_8A   = 4
+++PACK_MUL_8B   = 5
+++PACK_MUL_8C   = 6
+++PACK_MUL_8D   = 7
+++
+++# regfile a unpack modes
+++UNPACK_A_NOP = 0
+++UNPACK_A_16A = 1
+++UNPACK_A_16B = 2
+++UNPACK_A_8R  = 3
+++UNPACK_A_8A  = 4
+++UNPACK_A_8B  = 5
+++UNPACK_A_8C  = 6
+++UNPACK_A_8D  = 7
+++
+++# r4 unpack modes
+++UNPACK_R4_NOP = 0
+++UNPACK_R4_16A = 1
+++UNPACK_R4_16B = 2
+++UNPACK_R4_8R  = 3
+++UNPACK_R4_8A  = 4
+++UNPACK_R4_8B  = 5
+++UNPACK_R4_8C  = 6
+++UNPACK_R4_8D  = 7
+++
+++PACK_TYPE_INT    = 0
+++PACK_TYPE_FLOAT  = 1
+++PACK_TYPE_EITHER = -1
+++
+++PACK_MODE_A      = 0 # regfile a
+++PACK_MODE_M      = 1 # mul unit
+++PACK_MODE_EITHER = -1
+++
+++UNPACK_LOC_A     = 0 # regfile a
+++UNPACK_LOC_R4    = 1 # r4
+++UNPACK_LOC_AB    = 2 # either regfile a or regfile b
+++UNPACK_LOC_OTHER = 3 # somewhere else
+++
+++# args
+++#######
+++
+++# loc_t, ie internal
+++MUX_AC  = 0
+++MUX_ANY = 1
+++MUX_A   = 2
+++MUX_B   = 3
+++RW_EITHER = 0
+++RW_READ   = 1
+++RW_WRITE  = 2
+++
+++RADDR_NOP = 39
+++
+++# negatives are for internal use
+++RMUX_SEMA  = -6
+++RMUX_LABEL = -5
+++RMUX_IMMV  = -4
+++RMUX_IMM   = -3
+++RMUX_AC    = -2
+++RMUX_ANY   = -1
+++RMUX_A0    = 0 # followed by A1, A2, A3, A4, A5
+++RMUX_A     = 6
+++RMUX_B     = 7
+++
+++WADDR_R0  = 32 # followed by R1, R2, R3
+++WADDR_NOP = 39
+++
+++WMUX_ANY = 0
+++WMUX_A   = 1
+++WMUX_B   = 2
+++
+++# signals
+++##########
+++
+++SIG_BKPT       = 0
+++SIG_NORMAL     = 1
+++SIG_THRSW      = 2
+++SIG_THREND     = 3
+++SIG_SBWAIT     = 4
+++SIG_SBDONE     = 5
+++SIG_INT        = 6 # on a0
+++SIG_LTHRSW     = 6 # on b0
+++SIG_LOADCV     = 7
+++SIG_LOADC      = 8
+++SIG_LDCEND     = 9
+++SIG_LDTMU0     = 10
+++SIG_LDTMU1     = 11
+++SIG_ROTATE     = 12 # on a0
+++SIG_LOADAM     = 12 # on b0
+++SIG_SMALLIMMED = 13
+++SIG_IMMED      = 14
+++SIG_BRANCH     = 15
+++
+++# multi-line assembler constructs
+++##################################
+++
+++CONSTRUCT_MACRO = 0x1
+++CONSTRUCT_IF    = 0x2
+++CONSTRUCT_ELSE  = 0x4
+++CONSTRUCT_REP   = 0x8
+++
+++###############################################################################
+++# helpers
+++###############################################################################
+++
+++def asm_error(message, location = None):
+++   # Print a fatal error to stderr, then exit the process with status -1.
+++   where = current_location if location is None else location
+++   if where == '':
+++      sys.stderr.write('qasm ERROR: %s\n' % message)
+++   else:
+++      sys.stderr.write('qasm ERROR: %s: %s\n' % (where, message))
+++   sys.exit(-1)
+++
+++def asm_warning(message, location = None):
+++   # Emit a warning on stderr unless suppressed; fatal if warnings_are_errors.
+++   if disable_warnings or (nwarn_level != 0):
+++      return
+++   where = current_location if location is None else location
+++   if where == '':
+++      sys.stderr.write('qasm WARNING: %s\n' % message)
+++   else:
+++      sys.stderr.write('qasm WARNING: %s: %s\n' % (where, message))
+++   if warnings_are_errors:
+++      asm_error('warnings are errors!', where)
+++
+++# smart_split('') = []
+++# smart_split('a') = ['a']
+++# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
+++def smart_split(s, delim = ',', count = 0):
+++   # Split on delim only at bracket depth 0; a non-zero count caps the splits.
+++   if len(s) == 0:
+++      return []
+++   pieces = []
+++   nesting = 0
+++   start = 0
+++   for pos, ch in enumerate(s):
+++      if ch in '([{':
+++         nesting += 1
+++      elif ch in ')]}':
+++         nesting -= 1
+++      elif (ch == delim) and (nesting == 0):
+++         pieces.append(s[start:pos])
+++         start = pos + 1
+++         if len(pieces) == count:
+++            break
+++   if nesting != 0:
+++      asm_error('bracket nesting fail')
+++   return pieces + [s[start:]]
+++
+++def is_int(x): # True for Python 2 plain or long integers (bool is an int subclass)
+++   return isinstance(x, int) or isinstance(x, long)
+++
+++###############################################################################
+++# "parsing" stuff
+++###############################################################################
+++
+++re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
+++re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+++re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+++re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
+++re_include = re.compile('\\.include\\s(?P<filename>.+)$')
+++re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
+++re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
+++re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
+++re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
+++re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
+++re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
+++re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
+++re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
+++re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
+++re_label_ref_left = re.compile('\\b([ar]):')
+++re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
+++re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
+++
+++# ops
+++######
+++
+++aops = {
+++   'mov': (AOP_MOV, 2),
+++   'bra': (AOP_BRA, 2),
+++   'brr': (AOP_BRR, 2),
+++   'nop': (AOP_NOP, 0),
+++   'fadd': (AOP_FADD, 3),
+++   'fsub': (AOP_FSUB, 3),
+++   'fmin': (AOP_FMIN, 3),
+++   'fmax': (AOP_FMAX, 3),
+++   'fminabs': (AOP_FMINABS, 3),
+++   'fmaxabs': (AOP_FMAXABS, 3),
+++   'ftoi': (AOP_FTOI, 2),
+++   'itof': (AOP_ITOF, 2),
+++   'add': (AOP_ADD, 3),
+++   'sub': (AOP_SUB, 3),
+++   'shr': (AOP_SHR, 3),
+++   'asr': (AOP_ASR, 3),
+++   'ror': (AOP_ROR, 3),
+++   'shl': (AOP_SHL, 3),
+++   'min': (AOP_MIN, 3),
+++   'max': (AOP_MAX, 3),
+++   'and': (AOP_AND, 3),
+++   'or': (AOP_OR, 3),
+++   'xor': (AOP_XOR, 3),
+++   'not': (AOP_NOT, 2),
+++   'clz': (AOP_CLZ, 2),
+++   'v8adds': (AOP_V8ADDS, 3),
+++   'v8subs': (AOP_V8SUBS, 3)}
+++
+++def get_aop(aop): # look up an add-unit mnemonic in the aops table
+++   if aop not in aops: # unknown mnemonic: asm_error() exits the process
+++      asm_error('invalid aop')
+++   return aops[aop] # (AOP_* opcode, operand count)
+++
+++mops = {
+++   'mov': (MOP_MOV, 2),
+++   'nop': (MOP_NOP, 0),
+++   'fmul': (MOP_FMUL, 3),
+++   'mul24': (MOP_MUL24, 3),
+++   'v8muld': (MOP_V8MULD, 3),
+++   'v8min': (MOP_V8MIN, 3),
+++   'v8max': (MOP_V8MAX, 3),
+++   'v8adds': (MOP_V8ADDS, 3),
+++   'v8subs': (MOP_V8SUBS, 3)}
+++
+++def get_mop(mop): # look up a mul-unit mnemonic in the mops table
+++   if mop not in mops: # unknown mnemonic: asm_error() exits the process
+++      asm_error('invalid mop')
+++   return mops[mop] # (MOP_* opcode, operand count)
+++
+++# conds
+++########
+++
+++conds = {
+++   'ifz': COND_IFZ,
+++   'ifnz': COND_IFNZ,
+++   'ifn': COND_IFN,
+++   'ifnn': COND_IFNN,
+++   'ifc': COND_IFC,
+++   'ifnc': COND_IFNC}
+++
+++def get_cond(cond): # map a condition name to its COND_* code
+++   if not cond: # empty or None maps to COND_ALWAYS
+++      return COND_ALWAYS
+++   if cond not in conds: # unknown name: asm_error() exits the process
+++      asm_error('invalid cond')
+++   return conds[cond]
+++
+++bconds = {
+++   'allz': BCOND_ALLZ,
+++   'allnz': BCOND_ALLNZ,
+++   'anyz': BCOND_ANYZ,
+++   'anynz': BCOND_ANYNZ,
+++   'alln': BCOND_ALLN,
+++   'allnn': BCOND_ALLNN,
+++   'anyn': BCOND_ANYN,
+++   'anynn': BCOND_ANYNN,
+++   'allc': BCOND_ALLC,
+++   'allnc': BCOND_ALLNC,
+++   'anyc': BCOND_ANYC,
+++   'anync': BCOND_ANYNC}
+++
+++def get_bcond(bcond): # map a branch condition name to its BCOND_* code
+++   if not bcond: # empty or None maps to BCOND_ALWAYS
+++      return BCOND_ALWAYS
+++   if bcond not in bconds: # unknown name: asm_error() exits the process
+++      asm_error('invalid bcond')
+++   return bconds[bcond]
+++
+++def get_setf(setf):
+++   # Collapse any truthy/falsy value (e.g. a matched string or None) to a
+++   # plain True/False.
+++   return bool(setf)
+++
+++# packing/unpacking
+++####################
+++
+++packs = {
+++   '16a':    (PACK_A_16A,    PACK_TYPE_INT,    PACK_MODE_A),
+++   '16b':    (PACK_A_16B,    PACK_TYPE_INT,    PACK_MODE_A),
+++   '16af':   (PACK_A_16A,    PACK_TYPE_FLOAT,  PACK_MODE_A),
+++   '16bf':   (PACK_A_16B,    PACK_TYPE_FLOAT,  PACK_MODE_A),
+++   '8abcd':  (PACK_A_8888,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8a':     (PACK_A_8A,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8b':     (PACK_A_8B,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8c':     (PACK_A_8C,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8d':     (PACK_A_8D,     PACK_TYPE_EITHER, PACK_MODE_A),
+++   's':      (PACK_A_32S,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '16as':   (PACK_A_16AS,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '16bs':   (PACK_A_16BS,   PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8abcds': (PACK_A_8888S,  PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8as':    (PACK_A_8AS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8bs':    (PACK_A_8BS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8cs':    (PACK_A_8CS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8ds':    (PACK_A_8DS,    PACK_TYPE_EITHER, PACK_MODE_A),
+++   '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8ac':    (PACK_MUL_8A,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8bc':    (PACK_MUL_8B,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8cc':    (PACK_MUL_8C,   PACK_TYPE_EITHER, PACK_MODE_M),
+++   '8dc':    (PACK_MUL_8D,   PACK_TYPE_EITHER, PACK_MODE_M)}
+++
+++def get_pack(pack): # resolve a pack name to (mode value, PACK_TYPE_*, PACK_MODE_*)
+++   if not pack: # no pack requested: 0 is the NOP value of both pack tables
+++      return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER)
+++   if pack not in packs: # unknown name: asm_error() exits the process
+++      asm_error('invalid pack')
+++   return packs[pack]
+++
+++a_unpacks = {
+++   '16a':  (UNPACK_A_16A, PACK_TYPE_INT),
+++   '16b':  (UNPACK_A_16B, PACK_TYPE_INT),
+++   '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT),
+++   '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT),
+++   '8dr':  (UNPACK_A_8R,  PACK_TYPE_EITHER),
+++   '8a':   (UNPACK_A_8A,  PACK_TYPE_INT),
+++   '8b':   (UNPACK_A_8B,  PACK_TYPE_INT),
+++   '8c':   (UNPACK_A_8C,  PACK_TYPE_INT),
+++   '8d':   (UNPACK_A_8D,  PACK_TYPE_INT),
+++   '8ac':  (UNPACK_A_8A,  PACK_TYPE_FLOAT),
+++   '8bc':  (UNPACK_A_8B,  PACK_TYPE_FLOAT),
+++   '8cc':  (UNPACK_A_8C,  PACK_TYPE_FLOAT),
+++   '8dc':  (UNPACK_A_8D,  PACK_TYPE_FLOAT)}
+++
+++def get_a_unpack(unpack): # resolve a regfile-a unpack name to (UNPACK_A_*, PACK_TYPE_*, UNPACK_LOC_A)
+++   if not unpack: # no unpack requested
+++      return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A)
+++   if unpack not in a_unpacks: # unknown name: asm_error() exits the process
+++      asm_error('invalid ra unpack')
+++   return a_unpacks[unpack] + (UNPACK_LOC_A,) # append the location to the table's 2-tuple
+++
+++r4_unpacks = {
+++   '16af': UNPACK_R4_16A,
+++   '16bf': UNPACK_R4_16B,
+++   '8dr':  UNPACK_R4_8R,
+++   '8ac':  UNPACK_R4_8A,
+++   '8bc':  UNPACK_R4_8B,
+++   '8cc':  UNPACK_R4_8C,
+++   '8dc':  UNPACK_R4_8D}
+++
+++def get_r4_unpack(unpack): # resolve an r4 unpack name to (UNPACK_R4_*, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+++   if not unpack: # no unpack requested
+++      return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4)
+++   if unpack not in r4_unpacks: # unknown name: asm_error() exits the process
+++      asm_error('invalid r4 unpack')
+++   return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) # type is always EITHER for r4
+++
+++# args
+++#######
+++
+++class loc_t: # a register operand: mux (MUX_*), index, rotations, pack name, rw (RW_*)
+++   def __init__(self, mux, i, rot, r5_rot, pack, rw):
+++      self.mux = mux
+++      self.i = i
+++      self.rot = rot % 16 # both rotations are kept modulo 16
+++      self.r5_rot = r5_rot % 16
+++      self.pack = pack
+++      self.rw = rw
+++
+++   def copy(self):
+++      return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __add__(self, i): # loc + n moves the register index, e.g. r0 + 3 is r3
+++      if not is_int(i):
+++         raise Exception('can only add integer to loc')
+++      return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __sub__(self, i):
+++      if not is_int(i):
+++         raise Exception('can only subtract integer from loc')
+++      return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw)
+++
+++   def __cmp__(self, other): # Python 2 ordering: by index against ints, field by field against locs
+++      if is_int(other):
+++         return cmp(self.i, other)
+++      if not isinstance(other, loc_t):
+++         raise Exception('can only compare loc to integer or other loc')
+++      if self.mux != other.mux:
+++         return cmp(self.mux, other.mux)
+++      if self.i != other.i:
+++         return cmp(self.i, other.i)
+++      if self.rot != other.rot:
+++         return cmp(self.rot, other.rot)
+++      if self.r5_rot != other.r5_rot:
+++         return cmp(self.r5_rot, other.r5_rot)
+++      return cmp(self.pack, other.pack) # rw takes no part in the comparison
+++
+++   def is_r5(self):
+++      return (self.mux == MUX_AC) and (self.i == 5)
+++
+++   def shift(self, rot, left): # shared body of << and >>; left rotations are negative
+++      if isinstance(rot, loc_t) and rot.is_r5():
+++         if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack:
+++            raise Exception('can\'t rotate by rotated/unpacked r5')
+++         return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw)
+++      if not is_int(rot):
+++         raise Exception('can only rotate by integer or r5')
+++      return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw)
+++
+++   def __lshift__(self, rot):
+++      return self.shift(rot, True)
+++
+++   def __rshift__(self, rot):
+++      return self.shift(rot, False)
+++
+++   def __getattr__(self, name): # only reached for attributes not found normally
+++      # discard the first character if it is an underscore. this is a total hack
+++      # to allow packs starting with a digit to work
+++      if name[0] == '_':
+++         name = name[1:]
+++      if (name in packs) or (name in a_unpacks) or (name in r4_unpacks):
+++         if self.pack:
+++            raise Exception('can\'t specify two packs')
+++         return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw)
+++      raise AttributeError('loc has no attribute or pack mode %r' % name) # message is what the user sees
+++
+++   def __str__(self):
+++      if self.mux == MUX_AC:
+++         return 'r%d' % self.i
+++      if self.mux == MUX_ANY:
+++         return 'rany%d' % self.i
+++      if self.mux == MUX_A:
+++         return 'ra%d' % self.i
+++      if self.mux == MUX_B:
+++         return 'rb%d' % self.i
+++      assert 0
+++
+++class sema_t: # semaphore operand built by the sacq()/srel() helpers in arg_defs
+++   def __init__(self, acq, i): # acq: True for sacq, False for srel; i: semaphore index
+++      if not is_int(i):
+++         raise Exception('semaphore index must be integer')
+++      self.acq = acq
+++      self.i = i
+++
+++class label_t: # reference to a label plus an offset; immutable-style (+/- return new objects)
+++   def __init__(self, rel, name, offset): # rel: True via r_label_maker, False via a_label_maker
+++      self.rel = rel
+++      self.name = name
+++      self.offset = offset
+++
+++   def __add__(self, offset): # label + n
+++      return label_t(self.rel, self.name, self.offset + offset)
+++
+++   def __sub__(self, offset): # label - n
+++      return label_t(self.rel, self.name, self.offset - offset)
+++
+++class label_maker_t: # attribute access on an instance yields a label_t with offset 0
+++   def __init__(self, rel): # rel is passed through unchanged to every label_t made
+++      self.rel = rel
+++
+++   def __getattr__(self, name): # name arrives as '_<label>' after arg_eval's rewrite of x:label
+++      # we discard the first character. this is a total hack to allow numeric labels to work
+++      if not re_label_ref_right.match(name[1:]):
+++         raise Exception('invalid label reference')
+++      return label_t(self.rel, name[1:], 0)
+++
+++def bits(x, n):
+++   if (x >> n) == 0: # true only for 0 <= x < (1 << n)
+++      return x
+++   raise Exception('%d doesn\'t fit in %d bits' % (x, n))
+++
+++def bitsw(x, n):
+++   # Same check as bits(), except that the value 1 << n is encoded as 0.
+++   return bits(0 if x == (1 << n) else x, n)
+++
+++def bitsws(x, n):
+++   half = 1 << (n - 1)
+++   if x == half: # the value 1 << (n - 1) is encoded as 0
+++      x = 0
+++   elif -half <= x < 0:
+++      x += 1 << n # negative values become their n-bit two's complement
+++   return bits(x, n)
+++
+++def vpm_setup(n, stride, addr, v2 = False): # pack a VPM setup word; v2 selects the wider layout with bit 29 set
+++   horiz, laned, size, y, x, p = addr # 6-tuple as built by the h32/h16l/.../v8p helpers in arg_defs
+++   if size not in (0, 1, 2): # those helpers use 0 for 32-bit, 1 for 16-bit, 2 for 8-bit
+++      raise Exception('addr size should be 0, 1, or 2')
+++   if horiz:
+++      if x != 0:
+++         raise Exception('horizontal accesses must have x of 0')
+++   else:
+++      if (y & 0xf) != 0:
+++         raise Exception('vertical accesses must be 16 row aligned')
+++   hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) # 4-bit horiz/laned/size field
+++   if v2:
+++      return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) |
+++         (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size))
+++   return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) |
+++      (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size))
+++
+++def vdw_setup_0(n, m, addr): # pack a VDW setup-0 word (top two bits = 2) from n, m and a dma_* address
+++   horiz, size, y, x, p = addr # 5-tuple as built by the dma_h32/.../dma_v8p helpers in arg_defs
+++   if size not in (0, 1, 2):
+++      raise Exception('addr size should be 0, 1, or 2')
+++   return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) |
+++      (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size))
+++
+++def vdr_setup_0(n, m, addr, vpm_stride, stride): # pack a VDR setup-0 word (bit 31 set)
+++   horiz, size, y, x, p = addr # 5-tuple as built by the dma_* helpers in arg_defs
+++   if size not in (0, 1, 2):
+++      raise Exception('addr size should be 0, 1, or 2')
+++   if (stride < 8) or (stride & (stride - 1)): # x & (x - 1) is non-zero unless x is a power of two
+++      raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride')
+++   log2_stride = 3
+++   while (1 << log2_stride) != stride: # terminates because stride is a power of two >= 8
+++      log2_stride += 1
+++   return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) |
+++      (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) |
+++      (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) # note the horiz bit is stored inverted
+++
+++class allocator_t: # hands out registers by name: a.foo allocates on first use
+++   def __init__(self, *available): # each positional argument is one allocatable register
+++      self.available = list(available)
+++      self.allocated = {} # name -> register
+++      self.reserved = [] # registers withheld from allocation
+++
+++   def copy(self): # copies the three containers; the register objects are shared
+++      a = allocator_t()
+++      a.available = self.available[:]
+++      a.allocated = self.allocated.copy()
+++      a.reserved = self.reserved[:]
+++      return a
+++
+++   def forget(self): # return every register to the pool; must unpack, __init__ takes *available
+++      self.__init__(*(self.available + list(self.allocated.values()) + self.reserved))
+++
+++   def reserve(self, *rs): # raises ValueError if a register is not currently available
+++      for r in rs:
+++         self.available.remove(r)
+++         self.reserved.append(r)
+++
+++   def retire(self, name): # release the register bound to name; KeyError if none
+++      r = self.allocated.pop(name)
+++      del r.__invert__
+++      del r.retire
+++      self.available.append(r)
+++      return r
+++
+++   def __getattr__(self, name): # only reached for names that are not real attributes
+++      if name not in self.allocated:
+++         r = self.available.pop() # IndexError when the pool is exhausted
+++         r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax
+++         r.__invert__ = r.retire
+++         self.allocated[name] = r
+++      return self.allocated[name]
+++
+++def pragma_allow_xor_0(x):
+++   # Set the module-wide allow_xor_0 flag and hand back its previous value.
+++   global allow_xor_0
+++   if not isinstance(x, bool):
+++      raise Exception('allow_xor_0 must be bool')
+++   previous, allow_xor_0 = allow_xor_0, x
+++   return previous
+++
+++def pragma_dont_warn_when_mul_rot_inp_r5(x):
+++   # Set the module-wide flag of the same name; returns the value it replaced.
+++   global dont_warn_when_mul_rot_inp_r5
+++   if not isinstance(x, bool):
+++      raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool')
+++   previous, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x
+++   return previous
+++
+++arg_defs = {
+++   # special reg names (these alias the regular names, but also have appropriate read/write restrictions)
+++   'w':             loc_t(MUX_A,   15, 0, 0, None, RW_EITHER),
+++   'z':             loc_t(MUX_B,   15, 0, 0, None, RW_EITHER),
+++   'unif':          loc_t(MUX_ANY, 32, 0, 0, None, RW_READ),
+++   'vary':          loc_t(MUX_ANY, 35, 0, 0, None, RW_READ),
+++   'tmurs':         loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE),
+++   'r5quad':        loc_t(MUX_A,   37, 0, 0, None, RW_WRITE),
+++   'r5rep':         loc_t(MUX_B,   37, 0, 0, None, RW_WRITE),
+++   'elem_num':      loc_t(MUX_A,   38, 0, 0, None, RW_READ),
+++   'qpu_num':       loc_t(MUX_B,   38, 0, 0, None, RW_READ),
+++   'unif_addr':     loc_t(MUX_A,   40, 0, 0, None, RW_WRITE),
+++   'unif_addr_rel': loc_t(MUX_B,   40, 0, 0, None, RW_WRITE),
+++   'x_coord':       loc_t(MUX_A,   41, 0, 0, None, RW_EITHER),
+++   'y_coord':       loc_t(MUX_B,   41, 0, 0, None, RW_EITHER),
+++   'ms_mask':       loc_t(MUX_A,   42, 0, 0, None, RW_EITHER),
+++   'rev_flag':      loc_t(MUX_B,   42, 0, 0, None, RW_EITHER),
+++   'stencil':       loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE),
+++   'tlbz':          loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE),
+++   'tlbm':          loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE),
+++   'tlbc':          loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE),
+++   'vpm':           loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER),
+++   'vr_busy':       loc_t(MUX_A,   49, 0, 0, None, RW_READ),
+++   'vw_busy':       loc_t(MUX_B,   49, 0, 0, None, RW_READ),
+++   'vr_setup':      loc_t(MUX_A,   49, 0, 0, None, RW_WRITE),
+++   'vw_setup':      loc_t(MUX_B,   49, 0, 0, None, RW_WRITE),
+++   'vr_wait':       loc_t(MUX_A,   50, 0, 0, None, RW_READ),
+++   'vw_wait':       loc_t(MUX_B,   50, 0, 0, None, RW_READ),
+++   'vr_addr':       loc_t(MUX_A,   50, 0, 0, None, RW_WRITE),
+++   'vw_addr':       loc_t(MUX_B,   50, 0, 0, None, RW_WRITE),
+++   'mutex':         loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER),
+++   'recip':         loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE),
+++   'recipsqrt':     loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+++   'rsqrt':         loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE),
+++   'exp':           loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE),
+++   'log':           loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE),
+++   't0s':           loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE),
+++   't0t':           loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE),
+++   't0r':           loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE),
+++   't0b':           loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE),
+++   't1s':           loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE),
+++   't1t':           loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE),
+++   't1r':           loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE),
+++   't1b':           loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE),
+++
+++   # semaphore acq/rel
+++   'sacq': lambda i: sema_t(True, i),
+++   'srel': lambda i: sema_t(False, i),
+++
+++   # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label)
+++   'r_label_maker': label_maker_t(True),
+++   'a_label_maker': label_maker_t(False),
+++
+++   # handy functions
+++   'f':     lambda x: struct.unpack('I', struct.pack('f', x))[0],
+++   'sqrt':  math.sqrt,
+++   'sin':   math.sin,
+++   'cos':   math.cos,
+++   'atan2': math.atan2,
+++   'pi':    math.pi,
+++   'rseed': random.seed,
+++   'rand':  lambda: int(random.getrandbits(32)),
+++   'bits':  bits,
+++   'bitsw': bitsw,
+++   'bitsws': bitsws,
+++
+++   # handy vpm/vdw/vdr stuff
+++   'h32':  lambda y:       (1, 0, 0, y, 0, 0),
+++   'h16l': lambda y, p:    (1, 1, 1, y, 0, p),
+++   'h16p': lambda y, p:    (1, 0, 1, y, 0, p),
+++   'h8l':  lambda y, p:    (1, 1, 2, y, 0, p),
+++   'h8p':  lambda y, p:    (1, 0, 2, y, 0, p),
+++   'v32':  lambda y, x:    (0, 0, 0, y, x, 0),
+++   'v16l': lambda y, x, p: (0, 1, 1, y, x, p),
+++   'v16p': lambda y, x, p: (0, 0, 1, y, x, p),
+++   'v8l':  lambda y, x, p: (0, 1, 2, y, x, p),
+++   'v8p':  lambda y, x, p: (0, 0, 2, y, x, p),
+++   'dma_h32':  lambda y, x:    (1, 0, y, x, 0),
+++   'dma_h16p': lambda y, x, p: (1, 1, y, x, p),
+++   'dma_h8p':  lambda y, x, p: (1, 2, y, x, p),
+++   'dma_v32':  lambda y, x:    (0, 0, y, x, 0),
+++   'dma_v16p': lambda y, x, p: (0, 1, y, x, p),
+++   'dma_v8p':  lambda y, x, p: (0, 2, y, x, p),
+++   'vpm_setup': vpm_setup,
+++   'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True),
+++   'vdw_setup_0': vdw_setup_0,
+++   'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13),
+++   'vdr_setup_0': vdr_setup_0,
+++   'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride
+++   'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13),
+++
+++   # annotations
+++   'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)),
+++   'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff),
+++   'preserve_cond': ('preserve_cond', 1),
+++
+++   # somewhat experimental register allocator
+++   'allocator_t': allocator_t,
+++
+++   # pragmas
+++   'pragma_allow_xor_0': pragma_allow_xor_0,
+++   'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5}
+++
+++# accumulators and regs (regular names -- r0, ra0, etc)
+++arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6))
+++arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64))
+++
+++def arg_eval(arg, sets): # evaluate one operand expression; sets is the namespace handed to eval
+++   s = (arg.strip().split('.', 1) + [None])[:2] # [text before first '.', text after it or None]
+++   if s[0] == '-': # '-' (optionally '-.pack') is the write-only NOP destination
+++      return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE)
+++   arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings...
+++   arg = re_pack.sub('._\\1', arg) # '.8a' becomes '._8a' so it parses as an attribute
+++   try:
+++      # todo: i would like to be able to pass both arg_defs and sets in here
+++      # (with sets hiding arg_defs in the case of conflicts), but the obvious
+++      # dict(arg_defs, **sets) won't permit things such as:
+++      # .set f, lambda x: y
+++      # .set y, 4
+++      # (the y in the lambda will be looked up in the temporary dict we created
+++      # when evaluating the f .set, which doesn't contain y)
+++      #
+++      # instead, sets is initially set to (a copy of) arg_defs. to simulate the
+++      # hiding behaviour, on an unset, we restore any hidden arg_defs value.
+++      # also, before dumping sets at the end, we strip out the arg_defs stuff
+++      # (this isn't entirely correct as we want to dump sets that are hiding
+++      # arg_defs)
+++      return eval(arg, sets) # NOTE(review): runs assembler source text as Python; trusted input only
+++   except Exception, e:
+++      asm_error(e) # the exception's text becomes the error message
+++   except: # anything not derived from Exception
+++      asm_error('unknown error while evaluating argument')
+++
+++# doesn't check/fixup pack
+++def check_and_fixup_loc(loc, read): # validate loc for a read (read true) or a write; mutates loc in place
+++   if (not read) and (loc.rw == RW_READ):
+++      asm_error('writing to read-only hardware register')
+++   if read and (loc.rw == RW_WRITE):
+++      asm_error('reading from write-only hardware register')
+++   if not read:
+++      # conceptually, we are writing to a location rotated right by
+++      # loc.rot/loc.r5_rot. but we are actually rotating the output right by
+++      # -loc.rot/-loc.r5_rot then writing it to the unrotated location
+++      loc.rot = -loc.rot % 16
+++      loc.r5_rot = -loc.r5_rot % 16
+++   if (loc.rot != 0) and (loc.r5_rot != 0):
+++      asm_error('can\'t rotate by both r5 and immediate')
+++   if (loc.r5_rot != 0) and (loc.r5_rot != 1):
+++      asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) # bool indexes the pair
+++   if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later
+++      if not read:
+++         asm_error('target doesn\'t support write rotation')
+++      if loc.mux == MUX_ANY:
+++         loc.mux = MUX_A # can't do rotated read from regfile b
+++      if loc.mux != MUX_A:
+++         asm_error('rotation on read only allowed from regfile a')
+++      if loc.i >= 32:
+++         asm_warning('rotation only works from physical regfile')
+++   if loc.mux == MUX_AC: # accumulators r0..r5
+++      if (loc.i < 0) or (loc.i >= 6):
+++         asm_error('reg out of range')
+++      if not read:
+++         if loc.i == 4:
+++            asm_error('not allowed to write to r4')
+++         if loc.i == 5:
+++
+++            asm_error('not allowed to write to r5 -- please specify r5quad or r5rep')
+++   elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): # regfile addresses 0..63
+++      if (loc.i < 0) or (loc.i >= 64):
+++         asm_error('reg out of range')
+++   else:
+++      assert 0 # no other mux value is defined
+++
+++def get_dst(dst, sets): # evaluates a write operand; returns (waddr, wmux, pack, rot, r5_rot)
+++   if not dst:
+++      return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 # no operand: waddr/wmux are None; pack tuple is (op, type, mode)
+++   dst = arg_eval(dst, sets)
+++   if not isinstance(dst, loc_t):
+++      asm_error('invalid dst')
+++   dst = dst.copy() # copy so the fixup below doesn't modify the evaluated object
+++   check_and_fixup_loc(dst, False) # False: this is a write
+++   pack = get_pack(dst.pack) # pack[2] is compared against PACK_MODE_* below
+++   if dst.mux == MUX_AC:
+++      if pack[2] == PACK_MODE_A:
+++         asm_warning('ra packing only works when writing to physical regfile')
+++         return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot # still forces the regfile a write mux
+++      return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+++   if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation
+++      if (pack[2] == PACK_MODE_A) and (dst.i >= 32):
+++         asm_warning('ra packing only works when writing to physical regfile')
+++      return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot
+++   if dst.mux == MUX_ANY:
+++      return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot
+++   if dst.mux == MUX_B:
+++      if pack[2] == PACK_MODE_A:
+++         asm_error('this packing operation can only be used for regfile a')
+++      return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot
+++   assert 0 # check_and_fixup_loc only lets the mux values handled above through
+++
+++def get_src(src, sets): # evaluates a read operand; returns (raddr, rmux, unpack, drot, dr5rot), where the form of raddr depends on rmux
+++   if not src:
+++      return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None # no operand; unpack tuple is (op, type, location)
+++   src = arg_eval(src, sets)
+++   if isinstance(src, sema_t):
+++      if not have_sema:
+++         asm_error('target does not support semaphores')
+++      if (src.i < 0) or (src.i >= 16):
+++         asm_error('semaphore number must be in [0, 16)')
+++      return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # src.acq lands in bit 4, above the semaphore number
+++   if isinstance(src, label_t):
+++      return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # raddr is a (name, rel, offset) tuple
+++   if isinstance(src, list):
+++      if len(src) != 16:
+++         asm_error('vector immediate must have length 16')
+++      src = src[:] # copy before masking the elements in place
+++      for i in xrange(16):
+++         if not is_int(src[i]):
+++            asm_error('all elements of vector immediate must be integers')
+++         src[i] &= (1 << 32) - 1 # keep the low 32 bits
+++      return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0
+++   if is_int(src):
+++      return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 # immediate masked to 32 bits
+++   if not isinstance(src, loc_t):
+++      asm_error('invalid src')
+++   src = src.copy() # copy so the fixup below doesn't modify the evaluated object
+++   check_and_fixup_loc(src, True) # True: this is a read
+++   if mulw_rotate: # rotation is handed back as drot/dr5rot instead of being kept with the read address
+++      srot, sr5rot = 0, 0
+++      drot, dr5rot = src.rot, src.r5_rot
+++   else:
+++      srot, sr5rot = src.rot, src.r5_rot
+++      drot, dr5rot = 0, 0
+++   if src.mux == MUX_AC:
+++      if src.i == 4:
+++         return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot
+++      if src.pack:
+++         asm_error('unpack only allowed for regfile a or r4')
+++      return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+++   if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b
+++      return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot # regfile a raddr is an (index, rot, r5_rot) tuple
+++   if src.mux == MUX_ANY:
+++      return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot
+++   if src.mux == MUX_B:
+++      if src.pack:
+++         asm_error('unpack only allowed for regfile a or r4')
+++      return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot
+++   assert 0 # check_and_fixup_loc only lets the mux values handled above through
+++
+++# signals
+++##########
+++
+++sigs = { # signal mnemonic -> SIG_* value, looked up by get_sig
+++   'bkpt': SIG_BKPT,
+++   'thrsw': SIG_THRSW,
+++   'thrend': SIG_THREND,
+++   'sbwait': SIG_SBWAIT,
+++   'sbdone': SIG_SBDONE,
+++   'int': SIG_INT,
+++   'loadcv': SIG_LOADCV,
+++   'loadc': SIG_LOADC,
+++   'ldcend': SIG_LDCEND,
+++   'ldtmu0': SIG_LDTMU0,
+++   'ldtmu1': SIG_LDTMU1}
+++
+++def get_sig(sig): # returns the SIG_* value for a signal mnemonic
+++   if sig not in sigs:
+++      return SIG_NORMAL # anything that isn't a known mnemonic maps to SIG_NORMAL rather than raising
+++   return sigs[sig]
+++
+++# annotations
+++##############
+++
+++def get_annots(annot, sets): # evaluates annot and returns a list of (string, integer) pairs with the integer masked to 32 bits
+++   annots = arg_eval(annot, sets)
+++   if isinstance(annots, list):
+++      annots = annots[:] # copy; entries are overwritten below
+++   else:
+++      annots = [annots] # single annotation: wrap it in a list
+++   for i, annot in enumerate(annots): # note: rebinds the annot parameter
+++      if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or
+++         (not is_int(annot[1]))):
+++         asm_error('annotation must be (string, integer) pair, or a list of such pairs')
+++      annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) # keep the low 32 bits of the value
+++   return annots
+++
+++###############################################################################
+++# core
+++###############################################################################
+++
+++def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): # checks the 4 read unpacks and 2 write packs for conflicts; returns (pm, pack, unpack, forcebs, forcerafloat)
+++   needfloat = PACK_TYPE_EITHER
+++   havefloata = False
+++   havefloatr4 = False
+++   unpacka = None
+++   unpackr4 = None
+++   forcebs = [False, False, False, False] # one flag per read operand
+++   forcerafloat = False
+++
+++   pm = PACK_MODE_EITHER
+++   for i in (0, 1, 2, 3): # pass 1: gather the unpack op per location and any float/int requirement
+++      if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB):
+++         assert rpacks[i][0] == 0 # these locations never carry an unpack op
+++      else:
+++         if rpacks[i][2] == UNPACK_LOC_A:
+++            if unpacka is None:
+++               unpacka = rpacks[i][0]
+++            elif unpacka != rpacks[i][0]:
+++               asm_error('conflicting unpack operations on regfile a')
+++            havefloata = havefloata or rfloats[i]
+++         elif rpacks[i][2] == UNPACK_LOC_R4:
+++            if unpackr4 is None:
+++               unpackr4 = rpacks[i][0]
+++            elif unpackr4 != rpacks[i][0]:
+++               asm_error('conflicting unpack operations on r4')
+++            havefloatr4 = havefloatr4 or rfloats[i]
+++         else:
+++            assert 0
+++
+++         if rpacks[i][1] != PACK_TYPE_EITHER:
+++            if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]):
+++               asm_error('conflicting unpack float requirements')
+++            needfloat = rpacks[i][1]
+++   for i in (0, 1, 2, 3): # pass 2: separate loop because it needs the final value of unpacka
+++      if rpacks[i][2] == UNPACK_LOC_AB:
+++         if (unpacka is not None) and (unpacka != UNPACK_A_NOP):
+++            forcebs[i] = True # non-nop unpack from regfile a. must use b
+++
+++   if unpacka: # false for None and for 0
+++      if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: # asm_line passes aop == AOP_FTOI as couldrfloat
+++         havefloata = True
+++         forcerafloat = True
+++      havefloat = havefloata
+++   else:
+++      havefloat = havefloatr4
+++
+++   if (needfloat == PACK_TYPE_FLOAT) and (not havefloat):
+++      asm_error('float unpack operation used in integer alu operations')
+++   if (needfloat == PACK_TYPE_INT) and havefloat:
+++      asm_error('integer unpack operation used in float alu operation')
+++
+++   unpack = 0
+++   if unpacka and unpackr4:
+++      asm_error('cannot specify pack operation for both regfile a and r4')
+++   if unpacka:
+++      pm = PACK_MODE_A
+++      unpack = unpacka
+++   elif unpackr4:
+++      pm = PACK_MODE_M
+++      unpack = unpackr4
+++
+++   pack = 0
+++   if wpacks[0][2] == PACK_MODE_M: # wpacks[0] is the add result, wpacks[1] the mul result (order passed by asm_line)
+++      asm_error('mul-unit pack operation used on add result')
+++   for i in (0, 1):
+++      if wpacks[i][2] == PACK_MODE_A:
+++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A):
+++            asm_error('conflicting pack modes')
+++         pm = PACK_MODE_A
+++         pack = wpacks[i][0]
+++      elif wpacks[i][2] == PACK_MODE_M:
+++         if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M):
+++            asm_error('conflicting pack modes')
+++         pm = PACK_MODE_M
+++         pack = wpacks[i][0]
+++
+++      if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]):
+++         asm_error('float pack operation used with integer alu result')
+++      if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]:
+++         asm_error('integer pack operation used with float alu result')
+++
+++   if pm == PACK_MODE_EITHER: # nothing forced a mode: default to PACK_MODE_A
+++      pm = PACK_MODE_A
+++   return pm, pack, unpack, forcebs, forcerafloat
+++
+++# immediates that can be encoded with SIG_SMALLIMMED
+++bimms = {} # 32-bit value -> small immediate code (0..47)
+++bimms.update((i, i) for i in xrange(16)) # codes 0..15: the values 0..15
+++bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) # codes 16..31: -16..-1 stored modulo 2^32
+++bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) # codes 32..39: 127..134 shifted left by 23; presumably single-float bit patterns 1.0..128.0 -- TODO confirm
+++bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) # codes 40..47: 119..126 shifted left by 23; presumably single-float bit patterns 1/256..1/2 -- TODO confirm
+++
+++def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): # folds one read operand into the shared read state; returns updated (raddr_a, raddr_b, immb, arot_r5) plus the input mux for this operand
+++   if rmux == RMUX_SEMA:
+++      asm_error('semaphore op can only be used with mov')
+++   if rmux == RMUX_LABEL:
+++      asm_error('label not allowed here')
+++   if rmux == RMUX_IMMV:
+++      asm_error('vector immediate can only be used with mov')
+++   if rmux == RMUX_IMM:
+++      if raddr not in bimms:
+++         asm_error('can\'t encode immediate 0x%08x' % raddr)
+++      raddr = bimms[raddr] # raddr is now the small immediate code
+++      if not immb:
+++         if raddr_b is not None:
+++            asm_error('regfile b and immediates don\'t mix')
+++         raddr_b = raddr
+++         immb = True # raddr_b now holds an immediate/rotation code rather than a regfile b address
+++      elif raddr_b != raddr:
+++         asm_error('can only encode one rotation/immediate')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B # the immediate is read through the b slot
+++   if rmux == RMUX_AC:
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr # uses neither regfile read slot
+++   if rmux == RMUX_ANY:
+++      if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): # share an existing a read, unless a rotation is encoded (immb with raddr_b >= 48, or arot_r5) and mulw_rotate is off
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      if (not immb) and (raddr_b == raddr): # share an existing b read
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++      if raddr_a is None:
+++         assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))
+++         raddr_a = raddr
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      if raddr_b is None:
+++         assert not immb
+++         raddr_b = raddr
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++      asm_error('no free read slots')
+++   if rmux == RMUX_A: # here raddr is an (index, rot, r5_rot) tuple
+++      if (not mulw_rotate) and (raddr_a is not None) and (
+++         ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): # compares this operand's (imm rot, r5 rot) flags with those already recorded
+++         asm_error('conflicting rotations from regfile a')
+++      if raddr_a is None:
+++         raddr_a = raddr[0]
+++      elif raddr_a != raddr[0]:
+++         asm_error('can only read from one location in each regfile')
+++      arot_r5 = raddr[2]
+++      if raddr[1] == 0:
+++         return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++      raddr = 48 + raddr[1] # an immediate rotation is encoded in the b slot as 48 + amount
+++      if not immb:
+++         if raddr_b is not None:
+++            asm_error('regfile b and rotation don\'t mix')
+++         raddr_b = raddr
+++         immb = True
+++      elif raddr_b != raddr:
+++         asm_error('can only encode one rotation/immediate')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_A
+++   if rmux == RMUX_B:
+++      if immb:
+++         asm_error('regfile b and rotation/immediates don\'t mix')
+++      if raddr_b is None:
+++         raddr_b = raddr
+++      elif raddr_b != raddr:
+++         asm_error('can only read from one location in each regfile')
+++      return raddr_a, raddr_b, immb, arot_r5, RMUX_B
+++   assert 0 # no other rmux value is handled
+++
+++# ok if:
+++# - accumulator (r0-r3)
+++# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy,
+++#   and vw_busy. it's also true of r5 if it was written by r5rep, but not if it
+++#   was written by r5quad. so, by default, r5 isn't considered uniform. todo:
+++#   what about vr_wait/vw_wait/mutex?
+++def read_rot_ok(rmux, raddr_a, raddr_b): # rmux is a final input mux value as returned by merge_rmux; asm_line warns when this is false for a mul input
+++   return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or # mux 0-3 always ok; mux 5 only when the r5 warning is disabled
+++      ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy
+++      ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy
+++
+++def asm_flush_prog_data(): # moves pending prog_data bytes into prog as 'data' entries of two 32-bit words each
+++   global prog_data
+++
+++   while len(prog_data) & 7: # zero-pad to a multiple of 8 entries
+++      prog_data.append(0)
+++   for i in xrange(0, len(prog_data), 8):
+++      prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), # first word: entry i is the least significant byte
+++         (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) # second word, then the tag 'data' and an empty dict
+++   prog_data = [] # rebinds the global, hence the global statement above
+++
+++def asm_line(sets, location, line):
+++   global current_location, construct, nwarn_level
+++
+++   prev_location = current_location
+++   current_location = location
+++
+++   try:
+++      if construct != None:
+++         if re_macro.match(line):
+++            construct_stack.append(CONSTRUCT_MACRO)
+++         elif re_if.match(line):
+++            construct_stack.append(CONSTRUCT_IF)
+++         elif re_rep.match(line):
+++            construct_stack.append(CONSTRUCT_REP)
+++         else:
+++            else_m = line == '.else'
+++            elif_m = re_elif.match(line)
+++            if elif_m:
+++               end_construct = CONSTRUCT_IF
+++            else:
+++               end_construct = {
+++                  '.endm':  CONSTRUCT_MACRO,
+++                  '.else':  CONSTRUCT_IF,
+++                  '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE,
+++                  '.endr':  CONSTRUCT_REP}.get(line)
+++            if end_construct is not None:
+++               end_construct &= construct_stack.pop()
+++               if end_construct == 0:
+++                  if elif_m:
+++                     asm_error('unexpected .elif')
+++                  asm_error('unexpected %s' % line)
+++               if len(construct_stack) == 0:
+++                  lines = construct
+++                  construct = None
+++                  if end_construct == CONSTRUCT_MACRO:
+++                     return
+++                  if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE):
+++                     condition_if, condition_else = lines[0]
+++                     lines = lines[1:]
+++                     if condition_if:
+++                        for location, line in lines:
+++                           asm_line(sets, location, line)
+++                     if else_m:
+++                        construct = [(condition_else, False)]
+++                        construct_stack.append(CONSTRUCT_ELSE)
+++                     elif elif_m:
+++                        if elif_m.group('set'):
+++                           condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets))
+++                        else:
+++                           condition_if = condition_else and arg_eval(elif_m.group('condition'), sets)
+++                        condition_else = condition_else and (not condition_if)
+++                        construct = [(condition_if, condition_else)]
+++                        construct_stack.append(CONSTRUCT_IF)
+++                     return
+++                  if end_construct == CONSTRUCT_REP:
+++                     name, count = lines[0]
+++                     lines = lines[1:]
+++                     for i in xrange(count):
+++                        sets[name] = i
+++                        for location, line in lines:
+++                           asm_line(sets, location, line)
+++                     return
+++                  assert 0
+++               if else_m:
+++                  construct_stack.append(CONSTRUCT_ELSE)
+++               elif elif_m:
+++                  construct_stack.append(CONSTRUCT_IF)
+++         construct.append((current_location, line))
+++         return
+++
+++      if line in ('.endm', '.else', '.endif', '.endr'):
+++         asm_error('unexpected %s' % line)
+++      if re_elif.match(line):
+++         asm_error('unexpected .elif')
+++
+++      m = re_macro.match(line)
+++      if m:
+++         construct = []
+++         construct_stack.append(CONSTRUCT_MACRO)
+++         macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct)
+++         return
+++
+++      m = re_if.match(line)
+++      if m:
+++         if m.group('set'):
+++            condition = (m.group('set') == 'nset') ^ (m.group('name') in sets)
+++         else:
+++            # not not forces condition to a bool (this matters if condition is
+++            # something mutable like a list)
+++            condition = not not arg_eval(m.group('condition'), sets)
+++         construct = [(condition, not condition)]
+++         construct_stack.append(CONSTRUCT_IF)
+++         return
+++
+++      m = re_rep.match(line)
+++      if m:
+++         count = arg_eval(m.group('count'), sets)
+++         if not is_int(count):
+++            asm_error('.rep count must be integer')
+++         construct = [(m.group('name'), count)]
+++         construct_stack.append(CONSTRUCT_REP)
+++         return
+++
+++      m = re_include.match(line)
+++      if m:
+++         filename = arg_eval(m.group('filename'), sets)
+++         if not isinstance(filename, str):
+++            asm_error('expected string')
+++         asm_file(sets, '%s: %s' % (current_location, filename), filename)
+++         return
+++
+++      m = re_set.match(line)
+++      if m:
+++         sets[m.group('name')] = arg_eval(m.group('val'), sets)
+++         return
+++
+++      m = re_unset.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name not in sets:
+++            asm_error('%s not set' % name)
+++         if name in arg_defs: # todo: see arg_eval
+++            sets[name] = arg_defs[name]
+++         else:
+++            del sets[name]
+++         return
+++
+++      m = re_eval.match(line)
+++      if m:
+++         arg_eval(m.group('expr'), sets)
+++         return
+++
+++      m = re_print_info_warn_error.match(line)
+++      if m:
+++         def print_fn(message):
+++            print message
+++         def info_fn(message):
+++            sys.stderr.write('%s\n' % message)
+++         {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[
+++            m.group('print_info_warn_error')](arg_eval(m.group('message'), sets))
+++         return
+++
+++      m = re_assert.match(line)
+++      if m:
+++         if not arg_eval(m.group('condition'), sets):
+++            asm_error('assertion failure: \'%s\'' % m.group('condition'))
+++         return
+++
+++      m = re_data.match(line)
+++      if m:
+++         size = int(m.group('size'))
+++         for datum in smart_split(m.group('data')):
+++            datum = arg_eval(datum, sets)
+++            if not is_int(datum):
+++               asm_error('datum must be integer')
+++            prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size))
+++         return
+++
+++      m = re_macro_inst.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name in macros:
+++            params, lines = macros[name]
+++            args = smart_split(m.group('args'))
+++            if len(args) > len(params):
+++               asm_error('too many arguments to macro')
+++            sets = sets.copy()
+++            sets.update(zip(params, (arg_eval(arg, sets) for arg in args)))
+++            for param in params[len(args):]:
+++               if param in sets:
+++                  if param in arg_defs: # todo: see arg_eval
+++                     sets[param] = arg_defs[param]
+++                  else:
+++                     del sets[param]
+++            for location, line in lines:
+++               asm_line(sets, '%s: %s' % (current_location, location), line)
+++            return
+++
+++      if line == '.pushnwarn':
+++         nwarn_level += 1
+++         return
+++      if line == '.popnwarn':
+++         if nwarn_level == 0:
+++            asm_error('.popnwarn without .pushnwarn')
+++         nwarn_level -= 1
+++         return
+++
+++      # everything below assumes prog is up to date
+++      asm_flush_prog_data()
+++
+++      m = re_label.match(line)
+++      if m:
+++         name = m.group('name')
+++         if name[0].isdigit():
+++            labels.setdefault(name, []).append(len(prog))
+++         else:
+++            if name[0] == ':':
+++               undecorated_name = name[1:]
+++            else:
+++               undecorated_name = name
+++            if (undecorated_name in labels) or ((':' + undecorated_name) in labels):
+++               asm_error('named label defined twice')
+++            labels[name] = len(prog)
+++         return
+++
+++      annots = line.split('@')
+++      ops = [op.strip() for op in annots[0].split(';')]
+++      annots = sum((get_annots(annot, sets) for annot in annots[1:]), [])
+++      sig = get_sig(ops[-1])
+++      if sig != SIG_NORMAL:
+++         ops = ops[:-1]
+++      if len(ops) > 2:
+++         asm_error('too many ops')
+++      elif (len(ops) == 1) and (ops[0] == ''):
+++         ops = []
+++      ops = (ops + ['nop', 'nop'])[:2]
+++      m = re_op.match(ops[0])
+++      if not m:
+++         asm_error('invalid syntax')
+++      aop, aargs_n = get_aop(m.group('op'))
+++      if (aop == AOP_BRA) or (aop == AOP_BRR):
+++         acond = get_bcond(m.group('cond'))
+++      else:
+++         acond = get_cond(m.group('cond'))
+++      asf = get_setf(m.group('sf'))
+++      aargs = smart_split(m.group('args'))
+++      if len(aargs) != aargs_n:
+++         asm_error('wrong operand count')
+++      ard, ara, arb = (aargs + [None, None, None])[:3]
+++      m = re_op.match(ops[1])
+++      if not m:
+++         asm_error('invalid syntax')
+++      mop, margs_n = get_mop(m.group('op'))
+++      mcond = get_cond(m.group('cond'))
+++      msf = get_setf(m.group('sf'))
+++      margs = smart_split(m.group('args'))
+++      if len(margs) != margs_n:
+++         asm_error('wrong operand count')
+++      mrd, mra, mrb = (margs + [None, None, None])[:3]
+++      # eval srcs first so allocator can retire and reuse registers for dst
+++      aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets)
+++      abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets)
+++      maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets)
+++      mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets)
+++      awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets)
+++      mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets)
+++      if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or
+++         ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))):
+++         asm_error('cannot have 2 arguments with different rotations')
+++      if aarmux is not None:
+++         awrot = (awrot + aadrot) % 16
+++         awrot_r5 = (awrot_r5 + aadrot_r5) % 16
+++      if (awrot != 0) or awrot_r5:
+++         asm_error('rotate not allowed on add write')
+++      if marmux is not None:
+++         mwrot = (mwrot + madrot) % 16
+++         mwrot_r5 = (mwrot_r5 + madrot_r5) % 16
+++
+++      afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI)
+++      afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF)
+++      pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes(
+++         [aarpack, abrpack, marpack, mbrpack],
+++         [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL],
+++         aop == AOP_FTOI,
+++         [awpack, mwpack],
+++         [afloatw, mop == MOP_FMUL])
+++      if forcebs[0]:
+++         aarmux = RMUX_B
+++      if forcebs[1]:
+++         abrmux = RMUX_B
+++      if forcebs[2]:
+++         marmux = RMUX_B
+++      if forcebs[3]:
+++         mbrmux = RMUX_B
+++
+++      # extend nops to 3 operands
+++      if aop == AOP_NOP:
+++         awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+++      if mop == MOP_NOP:
+++         mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC
+++
+++      # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand)
+++      if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ):
+++         if forcerafloat:
+++            assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand
+++            # instead of duplicating the 2nd operand, take the ra operand from
+++            # the mul op thus forcing the ra value to be considered a float for
+++            # the purposes of unpacking
+++            if marmux == RMUX_A:
+++               abraddr, abrmux = maraddr, marmux
+++            else:
+++               assert mbrmux == RMUX_A
+++               abraddr, abrmux = mbraddr, mbrmux
+++         else:
+++            abraddr, abrmux = aaraddr, aarmux
+++      else:
+++         assert not forcerafloat # can only forcerafloat if we have an unused operand
+++
+++      # handle write addrs
+++      if (awmux == mwmux) and (awmux != WMUX_ANY):
+++         asm_error('add/mul ops not allowed to write to same regfile')
+++      ws = (awmux == WMUX_B) or (mwmux == WMUX_A)
+++
+++      # handle branch
+++      if (aop == AOP_BRA) or (aop == AOP_BRR):
+++         # check setf
+++         if asf:
+++            asm_error('setf not allowed on bra/brr')
+++
+++         # check pack/unpack
+++         if (pack != 0) or (unpack != 0):
+++            asm_error('pack/unpack not allowed with bra/brr')
+++
+++         # handle read address
+++         if aarmux == RMUX_LABEL:
+++            if (aop == AOP_BRA) and aaraddr[1]:
+++               asm_warning('bra with rel label')
+++            if (aop == AOP_BRR) and (not aaraddr[1]):
+++               asm_warning('brr with abs label')
+++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+++         if aarmux == RMUX_ANY:
+++            aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A
+++         if (aarmux != RMUX_IMM) and (aarmux != RMUX_A):
+++            asm_error('branch destination must be either label, immediate, or from regfile a')
+++         if aarmux == RMUX_IMM:
+++            imm = aaraddr
+++            raddr = 0 # can't use RADDR_NOP
+++         elif aarmux == RMUX_A:
+++            if (aaraddr[1] != 0) or (aaraddr[2] != 0):
+++               asm_error('rotation of read from regfile a not allowed with branch')
+++            if aop == AOP_BRR:
+++               asm_warning('brr with ra')
+++            imm = 0
+++            raddr = aaraddr[0]
+++         else:
+++            assert 0
+++
+++         # check mul op is nop
+++         if mop != MOP_NOP:
+++            asm_error('mul op not allowed with branch')
+++
+++         # check sig
+++         if sig != SIG_NORMAL:
+++            asm_error('no signal allowed with branch')
+++
+++         if raddr >= 32:
+++            asm_error('can only branch to register locations in physical regfile')
+++         if raddr & 1:
+++            asm_warning('branch instruction will destroy flags (see hw-2780)')
+++
+++         # construct branch instruction
+++         prog.append((imm,
+++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28),
+++            line, annots))
+++
+++         return
+++
+++      # use COND_NEVER when possible (might save power / allow mul setf)
+++      if not dict(annots).get('preserve_cond', 0):
+++          if (awaddr == WADDR_NOP) and (not asf):
+++             acond = COND_NEVER
+++          if (mwaddr == WADDR_NOP) and (not msf):
+++             mcond = COND_NEVER
+++
+++      # attempt to convert movs to ldi
+++      if (# no mul setf
+++         (not msf) and
+++         # ops must either be nop or mov of sema/label/imm/immv
+++         ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+++         ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and
+++         # but we don't want 2 nops
+++         ((aop != AOP_NOP) or (mop != MOP_NOP)) and
+++         # if both ops are movs, srcs must be identical
+++         ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and
+++         # no signal
+++         (sig == SIG_NORMAL)):
+++         # make sure aarmux/aaraddr contains the value
+++         if aop != AOP_MOV:
+++            aarmux = marmux
+++            aaraddr = maraddr
+++
+++         # convert immediate
+++         if aarmux == RMUX_SEMA:
+++            ldi_mode = LDI_SEMA
+++         elif aarmux == RMUX_LABEL:
+++            ldi_mode = LDI_32
+++            aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM
+++         elif aarmux == RMUX_IMMV:
+++            signed, unsigned = True, True
+++            imm = 0
+++            for i, elem in enumerate(aaraddr):
+++               if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1):
+++                  signed = False
+++               if elem not in (0, 1, 2, 3):
+++                  unsigned = False
+++               imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i))
+++            if not (signed or unsigned):
+++               asm_error('can\'t encode vector immediate')
+++            if signed:
+++               ldi_mode = LDI_EL_SIGNED
+++            else:
+++               ldi_mode = LDI_EL_UNSIGNED
+++            aaraddr, aarmux = imm, RMUX_IMM
+++         elif aarmux == RMUX_IMM:
+++            ldi_mode = LDI_32
+++         else:
+++            assert 0
+++
+++         # construct ldi instruction
+++         prog.append((aaraddr,
+++            (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28),
+++            line, annots))
+++
+++         return
+++
+++      # convert movs to alu ops
+++      if aop == AOP_MOV:
+++         if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0):
+++            aop = AOP_XOR
+++            aaraddr, aarmux = 0, RMUX_AC
+++            abraddr, abrmux = 0, RMUX_AC
+++         else:
+++            aop = AOP_OR
+++            abraddr, abrmux = aaraddr, aarmux
+++      if mop == MOP_MOV:
+++         if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0):
+++            mop = MOP_V8SUBS
+++            maraddr, marmux = 0, RMUX_AC
+++            mbraddr, mbrmux = 0, RMUX_AC
+++         else:
+++            mop = MOP_V8MIN
+++            mbraddr, mbrmux = maraddr, marmux
+++
+++      # normal alu instruction...
+++
+++      # handle setf
+++      if asf and (aop == AOP_NOP):
+++         asm_error('nop.setf is not allowed in add pipe')
+++      if msf and (mop == MOP_NOP):
+++         asm_warning('nop.setf, really?')
+++      if (aop == AOP_NOP) or (acond == COND_NEVER):
+++         sf = msf
+++      else:
+++         if msf:
+++            asm_error('setf only allowed on mul op if add op is nop or add condition is never')
+++         sf = asf
+++
+++      # handle read addrs
+++      raddr_a = None
+++      raddr_b = None
+++      immb = False
+++      arot_r5 = False
+++      muxes = [0, 0, 0, 0]
+++      if mwrot != 0:
+++         raddr_b = 48 + mwrot
+++         immb = True
+++      if mwrot_r5 and have_am:
+++         raddr_b = 48
+++         immb = True
+++      for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last
+++         for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux):
+++            if f(rmux):
+++               raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux)
+++      add_a, add_b, mul_a, mul_b = muxes
+++      if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)):
+++         # some output elements might not be as expected
+++         if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)):
+++            bad_elems = 0xffff
+++         else:
+++            bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111
+++            if mwrot > 12:
+++               bad_elems ^= 0xffff
+++         bad_elems &= dict(annots).get('mul_used', 0xffff)
+++         if not msf:
+++            if mwaddr == WADDR_NOP:
+++               # not writing anywhere and not setting flags. no elements used
+++               bad_elems = 0
+++            elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or
+++               ((not ws) and (mwaddr == 37))):
+++               # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/
+++               # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags.
+++               # only use element 0
+++               bad_elems &= 0x0001
+++            elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or
+++               ((not ws) and (mwaddr == 42))):
+++               # writing to r5quad/x_coord/y_coord/rev_flag and not setting
+++               # flags. only use elements 0, 4, 8, and 12
+++               bad_elems &= 0x1111
+++         if bad_elems:
+++            asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected')
+++      if raddr_a is None:
+++         raddr_a = RADDR_NOP
+++      if raddr_b is None:
+++         raddr_b = RADDR_NOP
+++      if immb:
+++         if sig != SIG_NORMAL:
+++            asm_error('rotation/immediates and signal don\'t mix')
+++         sig = SIG_SMALLIMMED
+++      if arot_r5 or (mwrot_r5 and (not have_am)):
+++         if sig != SIG_NORMAL:
+++            asm_error('rotation/immediates/signal don\'t mix')
+++         sig = SIG_ROTATE
+++
+++      # construct instruction
+++      prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29),
+++         (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28),
+++         line, annots))
+++   finally:
+++      current_location = prev_location
+++
+++def preprocess_passthrough(file): # default preprocessor: yields every line unchanged
+++   line_number = 0
+++   for line in file:
+++      line_number += 1 # so the first line is numbered 1
+++      yield line_number, line
+++
+++def asm_file(sets, location, filename, preprocess = None):
+++   global current_dir, current_location
+++
+++   if filename is None:
+++      location = '<stdin>'
+++      file = sys.stdin
+++
+++      prev_dir = current_dir
+++   else:
+++      filename = os.path.normpath(os.path.join(current_dir, filename))
+++
+++      try:
+++         file = open(filename)
+++      except Exception, e:
+++         asm_error(e)
+++      except:
+++         asm_error('unknown error while opening file %s' % filename)
+++
+++      prev_dir = current_dir
+++      current_dir = os.path.dirname(filename)
+++
+++   prev_location = current_location
+++   current_location = location
+++
+++   if preprocess is None:
+++      preprocess = preprocess_passthrough
+++
+++   try:
+++      for line_number, line in preprocess(file):
+++         # strip off comments and whitespace
+++         line = line.split('#')[0].strip()
+++         if line == '':
+++            continue
+++
+++         asm_line(sets, '%s: %d' % (current_location, line_number), line)
+++   finally:
+++      current_dir = prev_dir
+++      current_location = prev_location
+++
+++def asm_end_prog(): # finish the current program: sanity checks, flush data, resolve label references
+++   # check we aren't in a multi-line construct (eg .macro or .rep)
+++   if construct != None:
+++      asm_error({
+++         CONSTRUCT_MACRO: '.macro without .endm',
+++         CONSTRUCT_IF:    '.if/.elif without .endif',
+++         CONSTRUCT_ELSE:  '.else without .endif',
+++         CONSTRUCT_REP:   '.rep without .endr'}[construct_stack[-1]])
+++
+++   # check no warnings level back to 0
+++   if nwarn_level != 0:
+++      asm_error('.pushnwarn without .popnwarn')
+++
+++   # flush queued up data
+++   asm_flush_prog_data()
+++
+++   # fixup all the label references we can
+++   for pc in xrange(len(prog)):
+++      if isinstance(prog[pc][0], tuple): # first field is still a (location, label, rel, offset) reference
+++         location, label, rel, offset = prog[pc][0]
+++         if label[0].isdigit(): # numeric label: labels[] holds a list of pcs, last char picks the direction
+++            label_pcs = labels.get(label[:-1], [])
+++            if label[-1] == 'b':
+++               label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] # last one at or before pc
+++            else:
+++               label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] # first one after pc
+++            if label_pcs == []:
+++               asm_error('search for label reached begin/end of file', location = location)
+++            imm = label_pcs[0]
+++         elif label in labels:
+++            imm = labels[label]
+++         elif (':' + label) in labels:
+++            imm = labels[':' + label]
+++         elif external_link:
+++            continue # let the external linker deal with it
+++         else:
+++            asm_error('undefined label', location = location)
+++         imm = (imm * 8) + offset # label pc scaled by 8, plus the requested offset
+++         if rel:
+++            imm -= (pc + 4) * 8 # relative to instruction after delay slots
+++            imm &= (1 << 32) - 1 # wrap to an unsigned 32 bit value
+++         else:
+++            if not external_link:
+++               asm_error('can\'t get absolute address without using an external linker. this mode doesn\'t have an external linker', location = location)
+++            imm = (location, label, rel, offset, imm) # 5-tuple: dumpers add their base address to imm
+++         prog[pc] = (imm,) + prog[pc][1:] # replace the reference, keep the remaining fields
+++
+++def asm_init(): # set every piece of module-level assembler state to its starting value
+++   global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level
+++
+++   current_dir = os.getcwd()
+++   current_location = ''
+++   prog = []
+++   prog_data = []
+++   macros = { # the sacq/srel macros are predefined
+++      'sacq': (['dst', 'i'], [('candyland', 'mov  dst, sacq(i)')]),
+++      'srel': (['dst', 'i'], [('candyland', 'mov  dst, srel(i)')])}
+++   labels = {}
+++   construct = None
+++   construct_stack = []
+++   nwarn_level = 0
+++
+++def asm_reset_prog(): # start a fresh program; only prog and labels are cleared
+++   global prog, labels
+++
+++   prog = []
+++   labels = {}
+++
+++###############################################################################
+++# dumping
+++###############################################################################
+++
+++def print_lines(lines): # print each item of lines on its own output line
+++   for line in lines:
+++      print line
+++
+++class dumper_t: # base class for the output formats: every hook is a no-op
+++   def external_link(self): return False # main copies this into the external_link global
+++   def begin(self): pass # called by dump before the first label/line
+++   def label(self, pc, name): pass # called by dump for each label, before the line at that pc
+++   def line(self, pc, ls, ms, line, annots, first): pass # called by dump for each prog entry
+++   def end(self): pass # called by dump after the last label/line
+++   def sets(self, sets): pass # called once by main with the sets whose names are not in arg_defs
+++   def direct(self, line): pass # called by the preprocessors with text to pass straight through
+++
+++class clif_dumper_t(dumper_t): # dumper for mode 'clif' (the mode main assumes when none is given)
+++   def __init__(self):
+++      self.annot_mode = 0
+++
+++   def external_link(self):
+++      return True
+++
+++   def parse_annot_mode(self, line): # line is '0', '1' or '2,<buffer name>' (the text after '%@')
+++      l = line.split(',')
+++      self.annot_mode = int(l[0])
+++      if self.annot_mode not in (0, 1, 2):
+++         asm_error('bad annot mode')
+++      if self.annot_mode == 2:
+++         if len(l) != 2:
+++            asm_error('expected buffer name')
+++         self.annot_name = l[1].strip()
+++         self.annot_offset = 0
+++      elif len(l) != 1:
+++         asm_error('unexpected comma')
+++
+++   def label(self, pc, name):
+++      if (self.annot_mode != 1) and (name[0] == ':'): # ':' labels become @label lines, except in mode 1
+++         if self.annot_mode == 2:
+++            name = name + '_annotations'
+++         print '@label %s' % name[1:]
+++      else:
+++         print '// :%s' % name
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if self.annot_mode == 0: # mode 0: emit the two instruction words
+++         if isinstance(ls, tuple):
+++            if len(ls) == 5: # absolute reference to a label inside this program
+++               location, label, rel, offset, offset_from_prog = ls
+++               assert not rel
+++               ls = '[. - %d + %d]' % (pc * 8, offset_from_prog)
+++            else: # label not defined here: refer to it by name
+++               location, label, rel, offset = ls
+++               if rel:
+++                  asm_error('relative external label references not allowed in this mode', location = location)
+++               ls = '[%s + %d]' % (label, offset)
+++         else:
+++            ls = '0x%08x' % ls
+++         print '%s 0x%08x // %s' % (ls, ms, line)
+++      elif self.annot_mode == 1: # mode 1: emit only the annotations, as (id, value) word pairs
+++         print '// %s' % line
+++         for annot in annots:
+++            print '0x%08x 0x%08x // %s' % ({
+++               # todo: would rather not have these hard coded
+++               'mul_used':              1,
+++               'preserve_cond':         2,
+++               'geomd_open':            3,
+++               'geomd_i':               4,
+++               'geomd_tris_clear':      5,
+++               'geomd_verts':           6,
+++               'geomd_tris_add':        7,
+++               'geomd_tris_set_center': 8,
+++               'geomd_region_clear':    9,
+++               'geomd_region_set':      10,
+++               'geomd_images_clear':    11,
+++               'geomd_images_l':        12,
+++               'geomd_images_b':        13,
+++               'geomd_images_r':        14,
+++               'geomd_images_t':        15,
+++               'geomd_images_add_vpm':  16,
+++               'trace_4c':              17,
+++               'geomd_images_add_tex':  18,}[annot[0]], annot[1], annot[0])
+++         if len(annots) != 0:
+++            print '0x00000000 // end'
+++      else: # mode 2: emit one word per instruction, 0 or a reference into the named buffer
+++         assert self.annot_mode == 2
+++         if len(annots) == 0:
+++            print '0x00000000 // %s' % line
+++         else:
+++            print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
+++            self.annot_offset += (len(annots) * 8) + 4 # presumably the size of the mode 1 output: 8 per annot + 4 for end
+++
+++   def direct(self, line):
+++      print line
+++
+++class plain_dumper_t(dumper_t): # dumper for mode 'plain': one 'ls, ms,' pair of hex words per line
+++   def line(self, pc, ls, ms, line, annots, first):
+++      print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
+++
+++class c_c_dumper_t(dumper_t): # dumper for mode 'c_c': prints a C file defining the program as an array
+++   def __init__(self, header_name, full_header_name, array_name): # full_header_name is not used here
+++      self.header_name = header_name
+++      self.array_name = array_name
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      self.external_labels = set()
+++      self.lines = [] # array body, held back until end()
+++
+++      print '#include "%s.h"' % self.header_name
+++      print ''
+++      print '#ifdef _MSC_VER'
+++      print '   #include <stdint.h>'
+++      print '   /* cast through uintptr_t to avoid warnings */'
+++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
+++      print '#else'
+++      print '   #define POINTER_TO_UINT(X) ((unsigned int)(X))'
+++      print '#endif'
+++      print ''
+++      print '#ifdef __cplusplus'
+++      print 'extern "C" { /* the types are probably wrong... */'
+++      print '#endif'
+++
+++   def label(self, pc, name):
+++      self.lines.append('// :%s' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple):
+++         if len(ls) == 5: # absolute reference into this program: array address + offset
+++            location, label, rel, offset, offset_from_prog = ls
+++            assert not rel
+++            ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
+++         else:
+++            location, label, rel, offset = ls
+++            if rel:
+++               asm_error('relative external label references not allowed in this mode', location = location)
+++            if label not in self.external_labels: # declare each external label once, printed straight away
+++               self.external_labels.add(label)
+++               print 'extern uint8_t %s[];' % label
+++            ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
+++      else:
+++         ls = '0x%08x' % ls
+++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+++
+++   def end(self): # close the extern "C" block opened in begin(), then print the array itself
+++      print '#ifdef __cplusplus'
+++      print '}'
+++      print '#endif'
+++      print ''
+++      print '#ifdef _MSC_VER'
+++      print '__declspec(align(8))'
+++      print '#elif defined(__GNUC__)'
+++      print '__attribute__((aligned(8)))'
+++      print '#endif'
+++      print 'unsigned int %s[] = {' % self.array_name
+++      print_lines(self.lines)
+++      print '};'
+++      print '#ifdef __HIGHC__'
+++      print '#pragma Align_to(8, %s)' % self.array_name
+++      print '#endif'
+++
+++class c_h_dumper_t(dumper_t): # dumper for mode 'c_h': prints the header declaring the program array
+++   def __init__(self, header_name, full_header_name, array_name): # header_name is not used here
+++      self.full_header_name = full_header_name
+++      self.array_name = array_name
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self): # include guard plus the array declaration
+++      print '#ifndef %s_H' % self.full_header_name
+++      print '#define %s_H' % self.full_header_name
+++      print ''
+++      print 'extern unsigned int %s[];' % self.array_name
+++      print ''
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # only ':' labels get a #define
+++         print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
+++
+++   def end(self):
+++      print ''
+++      print '#endif'
+++
+++class ml_c_dumper_t(dumper_t): # dumper for mode 'ml_c': prints a C file with the code array and a <name>_link function
+++   def __init__(self, header_name, full_header_name, name, annots): # annots: True if ',annots' was given
+++      self.header_name = header_name
+++      self.name = name
+++      self.annots = annots
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      if self.annots:
+++         self.annot_lines = []
+++      self.lines = []
+++      self.external_labels = set()
+++      self.link_lines = []
+++
+++      print '#include "%s.h"' % self.header_name
+++      print '#include <assert.h>'
+++      if self.annots:
+++         print '#ifdef SIMPENROSE' # matching #endif is printed by end()
+++         print '#include <stddef.h>'
+++         print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
+++      print ''
+++
+++   def label(self, pc, name):
+++      self.lines.append('// :%s' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if self.annots:
+++         if len(annots) == 0:
+++            self.annot_lines.append('NULL,')
+++         else: # print this instruction's annotation array now, remember its name for the table
+++            print 'static unsigned int const annotations_%d[] = {' % pc
+++            for annot in annots:
+++               print '   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
+++            print '   SIMPENROSE_SHADER_ANNOTATION_END};'
+++            print ''
+++            self.annot_lines.append('annotations_%d,' % pc)
+++      if isinstance(ls, tuple): # unresolved reference: emit a placeholder and a statement that patches it
+++         self.link_lines.append('   assert(p[%d] == 0xdeadbeef);' % (pc * 2))
+++         if len(ls) == 5:
+++            location, label, rel, offset, offset_from_prog = ls
+++            assert not rel
+++            self.link_lines.append('   p[%d] = base + %d;' % (pc * 2, offset_from_prog))
+++         else:
+++            location, label, rel, offset = ls
+++            self.external_labels.add(label)
+++            if rel:
+++               self.link_lines.append('   p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
+++            else:
+++               self.link_lines.append('   p[%d] = %s + %d;' % (pc * 2, label, offset))
+++         ls = '0xdeadbeef'
+++      else:
+++         ls = '0x%08x' % ls
+++      self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
+++
+++   def end(self):
+++      if self.annots:
+++         print 'unsigned int const *const %s_annotations_array[] = {' % self.name
+++         print_lines(self.annot_lines)
+++         print '};'
+++         print '#endif'
+++         print ''
+++      print 'static unsigned int const array[] = {'
+++      print_lines(self.lines)
+++      print '};'
+++      print ''
+++      print 'void %s_link(void *p_in, unsigned int base' % self.name
+++      for label in sorted(self.external_labels): # one extra parameter per external label, in sorted order
+++         print '   , unsigned int %s' % label
+++      print '   )'
+++      print '{'
+++      print '   unsigned int *p = (unsigned int *)p_in;'
+++      print '   unsigned int i;'
+++      print '   for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper()
+++      print '      p[i] = array[i];'
+++      print '   }'
+++      print_lines(self.link_lines)
+++      print '}'
+++
+++class ml_h_dumper_t(dumper_t): # dumper for mode 'ml_h': prints the header declaring <name>_link and <NAME>_SIZE
+++   def __init__(self, header_name, full_header_name, name, annots): # header_name is not used here
+++      self.full_header_name = full_header_name
+++      self.name = name
+++      self.annots = annots
+++
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      self.external_labels = set()
+++      self.lines_n = 0 # number of prog entries seen, used for <NAME>_SIZE
+++
+++      print '#ifndef %s_H' % self.full_header_name
+++      print '#define %s_H' % self.full_header_name
+++      print ''
+++      if self.annots:
+++         print '#ifdef SIMPENROSE'
+++         print '   extern unsigned int const *const %s_annotations_array[];' % self.name
+++         print '#endif'
+++         print ''
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # only ':' labels are exported
+++         print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8)
+++         if self.annots:
+++            print '#ifdef SIMPENROSE'
+++            print '   #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc)
+++            print '#endif'
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple) and (len(ls) != 5): # 4-tuple: label not defined in this program
+++         self.external_labels.add(ls[1])
+++      self.lines_n += 1
+++
+++   def end(self):
+++      print ''
+++      print 'extern void %s_link(void *p, unsigned int base' % self.name
+++      for label in sorted(self.external_labels): # one extra parameter per external label, in sorted order
+++         print '   , unsigned int %s' % label
+++      print '   );'
+++      print ''
+++      print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8))
+++      print ''
+++      print '#endif'
+++
+++def print_lines_lc(lines): # print each line followed by ' \' (a C line continuation)
+++   for line in lines:
+++      print '%s \\' % line
+++
+++def print_groups_lc(groups): # print every group as a '{ ... }' block, comma separated, with line continuations
+++   first = True
+++   for group in groups:
+++      if first:
+++         print '{ \\'
+++      else:
+++         print ', { \\' # separator goes in front of every group but the first
+++      print_lines_lc(group)
+++      print '} \\'
+++      first = False
+++
+++class inline_c_dumper_t(dumper_t): # dumper for modes 'inline_c' and 'inline_c:annots'
+++   def __init__(self, annots):
+++      self.annots = annots
+++      self.iteration = False # True between begin_iteration() and end_iteration()
+++
+++   def begin_iteration(self): # called by preprocess_inline_c before the programs of a '%|' block
+++      assert not self.iteration
+++      self.iteration = True
+++      self.iteration_lines = []
+++      if self.annots:
+++         self.iteration_annot_lines = []
+++         self.annot_arrs = []
+++
+++   def end_iteration(self): # print the instruction count, then every collected program as one group
+++      assert self.iteration
+++      self.iteration = False
+++      print '%d, \\' % self.iteration_n
+++      if self.annots:
+++         print '( \\'
+++      print_groups_lc(self.iteration_lines)
+++      if self.annots:
+++         print '), ( \\'
+++         print_groups_lc(self.iteration_annot_lines)
+++         print '), ( \\'
+++         for annot_arr in self.annot_arrs:
+++            print_lines_lc(annot_arr)
+++         print ') \\'
+++
+++   def begin(self):
+++      self.n = 0 # instructions in the program being dumped
+++      self.lines = []
+++      if self.annots:
+++         self.annot_lines = []
+++         if not self.iteration: # inside an iteration the arrays accumulate across programs
+++            self.annot_arrs = []
+++
+++   def label(self, pc, name):
+++      self.lines.append('/* :%s */' % name)
+++      if self.annots:
+++         self.annot_lines.append('/* :%s */' % name)
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      self.n += 1
+++      if first: # entries are comma separated, so all but the first get a leading ', '
+++         prefix = ''
+++      else:
+++         prefix = ', '
+++      self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line))
+++      if self.annots:
+++         if len(annots) == 0:
+++            a = 'NULL'
+++         else: # arrays are named by their index in annot_arrs
+++            a = 'annotations_%d' % len(self.annot_arrs)
+++            annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)]
+++            for annot in annots:
+++               annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]))
+++            annot_arr.append('   SIMPENROSE_SHADER_ANNOTATION_END};')
+++            self.annot_arrs.append(annot_arr)
+++         self.annot_lines.append('%s%s /* %s */' % (prefix, a, line))
+++
+++   def end(self):
+++      if self.iteration: # just collect; end_iteration() does the printing
+++         if len(self.iteration_lines) == 0:
+++            self.iteration_n = self.n
+++         elif self.iteration_n != self.n:
+++            asm_error('number of instructions differs between iterations')
+++         self.iteration_lines.append(self.lines)
+++         if self.annots:
+++            self.iteration_annot_lines.append(self.annot_lines)
+++      else:
+++         if self.annots:
+++            print '( \\'
+++         print_lines_lc(self.lines)
+++         if self.annots:
+++            print '), ( \\'
+++            print_lines_lc(self.annot_lines)
+++            print '), ( \\'
+++            for annot_arr in self.annot_arrs:
+++               print_lines_lc(annot_arr)
+++            print ') \\'
+++
+++   def direct(self, line):
+++      print line
+++
+++class asvc_dumper_t(dumper_t): # dumper for mode 'asvc': prints .align/.word assembler text
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      print '.align 8'
+++
+++   def label(self, pc, name):
+++      if name[0] == ':': # ':' labels are printed with '::', the rest with ':'
+++         print '%s::' % name[1:]
+++      else:
+++         print '%s:' % name
+++
+++   def line(self, pc, ls, ms, line, annots, first):
+++      if isinstance(ls, tuple):
+++         location, label, rel, offset = ls[:4] # accepts both the 4- and the 5-tuple form
+++         if rel:
+++            ls = '%s + %d - (. + 32)' % (label, offset) # 32 = 4 * 8, cf. the (pc + 4) * 8 base in asm_end_prog
+++         else:
+++            ls = '%s + %d' % (label, offset)
+++      else:
+++         ls = '0x%08x' % ls
+++      print '.word %s, 0x%08x ; %s' % (ls, ms, line)
+++
+++def is_ra_or_rb(val): # True only for a loc_t whose mux is MUX_A or MUX_B
+++   return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B))
+++
+++class aliases_dumper_t(dumper_t): # dumper for mode 'aliases': prints label and register names as C string pairs
+++   def external_link(self):
+++      return True
+++
+++   def begin(self):
+++      print '#ifndef JUST_DQASM_ARGS'
+++
+++   def label(self, pc, name):
+++      if not name[0].isdigit(): # numeric labels are not listed
+++         if name[0] == ':':
+++            name = name[1:]
+++         print '"bs%s", "bs%x",' % (name, pc * 8)
+++         print '"bu%s", "bu%x",' % (name, pc * 8)
+++
+++   def end(self):
+++      print '#endif'
+++
+++   # todo: handle things other than ra and rb? dqasm only allows ra and rb atm
+++   def sets(self, sets): # list-valued sets are expanded element by element as name[i]
+++      dqasm_args = []
+++      print '#ifndef JUST_DQASM_ARGS'
+++      for name in sets:
+++         if is_ra_or_rb(sets[name]):
+++            dqasm_args.append('-r%s=%s' % (sets[name], name))
+++            print '"%s", "%s",' % (name, sets[name])
+++         elif isinstance(sets[name], list):
+++            for i, val in enumerate(sets[name]):
+++               if is_ra_or_rb(val):
+++                  dqasm_args.append('-r%s=%s[%d]' % (val, name, i))
+++                  print '"%s[%d]", "%s",' % (name, i, val)
+++      print '#endif'
+++      print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) # printed outside the #ifndef, so always defined
+++
+++def dump(dumper): # feed the assembled prog and labels to dumper, in pc order
+++   if (len(prog) != 0) or (len(labels) != 0): # nothing at all is emitted (not even begin/end) when both are empty
+++      dumper.begin()
+++
+++      sorted_labels = []
+++      for name in labels:
+++         if name[0].isdigit(): # numeric labels map to a list of pcs
+++            for pc in labels[name]:
+++               sorted_labels.append((pc, name))
+++         else:
+++            sorted_labels.append((labels[name], name))
+++      sorted_labels.sort(reverse = True) # descending, so pop() hands back the lowest pc first
+++
+++      first = True
+++      for pc in xrange(len(prog)):
+++         ls, ms, line, annots = prog[pc]
+++         while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc):
+++            dumper.label(*sorted_labels.pop())
+++         dumper.line(pc, ls, ms, line, annots, first)
+++         first = False
+++      for sorted_label in sorted_labels: # whatever is left must sit just past the last instruction
+++         assert sorted_label[0] == len(prog)
+++         dumper.label(*sorted_label)
+++
+++      dumper.end()
+++
+++###############################################################################
+++# preprocessing
+++###############################################################################
+++
+++def preprocess_inline_c(dumper): # returns a preprocessor for asm embedded in other text between '%[' and '%]'
+++   def preprocess(file):
+++      ls = None # None while outside '%[ ... %]', else the (line_number, text) pieces collected so far
+++      line_number = 0
+++      for line in file:
+++         line_number += 1
+++         while True: # a single input line may open and close several asm regions
+++            if ls is None:
+++               l = line.split('%[', 1)
+++               if len(l) == 1:
+++                  dumper.direct(l[0].rstrip())
+++                  break
+++               dumper.direct('%s \\' % l[0].rstrip())
+++               line = l[1]
+++               ls = []
+++            else:
+++               l = line.split('%]', 1)
+++               ls.append((line_number, l[0]))
+++               if len(l) == 1: # region continues on the next input line
+++                  break
+++               line = l[1]
+++               l = ls[-1][1].split('%|', 1) # '%|' in the last piece introduces the replacement lists
+++               if len(l) == 1:
+++                  for l_number, l in ls:
+++                     yield l_number, l
+++                  asm_end_prog()
+++                  dump(dumper)
+++                  asm_reset_prog()
+++               else:
+++                  ls[-1] = (ls[-1][0], l[0])
+++                  if hasattr(dumper, 'begin_iteration'):
+++                     dumper.begin_iteration()
+++                  for repls in l[1].split('%,'): # one assembly pass per '%,'-separated list
+++                     repls = [repl.strip() for repl in repls.split('%/')] # '%/'-separated values for %0, %1, ...
+++                     for l_number, l in ls:
+++                        for i, repl in enumerate(repls):
+++                           l = l.replace('%' + str(i), repl)
+++                        yield l_number, l
+++                     asm_end_prog()
+++                     dump(dumper)
+++                     asm_reset_prog()
+++                  if hasattr(dumper, 'end_iteration'):
+++                     dumper.end_iteration()
+++               ls = None
+++   return preprocess
+++
+++def preprocess_clif(dumper): # returns a preprocessor that picks the asm lines out of a clif file
+++   def preprocess(file):
+++      in_asm = False # True between a '%[' line and a '%]' line
+++      line_number = 0
+++      for line in file:
+++         line_number += 1
+++         if in_asm:
+++            if line.strip() == '%]':
+++               asm_end_prog()
+++               dump(dumper)
+++               asm_reset_prog()
+++               in_asm = False
+++            else:
+++               yield line_number, line
+++         else:
+++            if line.strip() == '%[':
+++               in_asm = True
+++            elif (line[:1] == '%') and (line[:2] != '%@'): # single asm line: drop the leading '%'
+++               yield line_number, line[1:]
+++            else: # any other line ends the program assembled so far
+++               asm_end_prog()
+++               dump(dumper)
+++               asm_reset_prog()
+++               if line[:2] == '%@':
+++                  if hasattr(dumper, 'parse_annot_mode'):
+++                     dumper.parse_annot_mode(line[2:])
+++               else:
+++                  dumper.direct(line.rstrip())
+++   return preprocess
+++
+++###############################################################################
+++# main
+++###############################################################################
+++
+++def main(): # command line entry point: parse options, pick dumper and target, assemble, dump
+++   global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+++   global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+++   # NOTE(review): have_lthrsw is assigned below but not declared global, so it stays local to main -- confirm
+++   asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+++
+++   # parse command line
+++   parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+++   parser.add_option('-m', '--mode', dest = 'mode',
+++      help = '<mode> should be clif, plain, ' +
+++      'c_c:<header_name>,<full_header_name>,<array_name>, ' +
+++      'c_h:<header_name>,<full_header_name>,<array_name>, ' +
+++      'ml_c:<header_name>,<full_header_name>,<name>[,annots], ' +
+++      'ml_h:<header_name>,<full_header_name>,<name>[,annots], ' +
+++      'inline_c[:annots], asvc, or aliases[:<preprocess_mode>]', metavar = '<mode>')
+++   parser.add_option('-t', '--target', dest = 'target',
+++      help = '<target> should be a0, b0, or hera', metavar = '<target>')
+++   parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+++   parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+++   parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+++   parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+++   parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+++   options, args = parser.parse_args()
+++   if len(args) == 0:
+++      filename = None # asm_file reads stdin in this case
+++   elif len(args) == 1:
+++      filename = args[0]
+++   else:
+++      parser.print_help()
+++      sys.exit(-1)
+++
+++   # handle mode
+++   mode = options.mode or 'clif' # assume clif if no mode specified
+++   if mode == 'clif':
+++      dumper = clif_dumper_t()
+++      preprocess = preprocess_clif(dumper)
+++   elif mode == 'plain':
+++      dumper = plain_dumper_t()
+++      preprocess = None
+++   elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+++      mode_options = mode[4:].split(',')
+++      if len(mode_options) != 3:
+++         asm_error('badly formatted mode on command line')
+++      dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+++      preprocess = None
+++   elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+++      mode_options = mode[5:].split(',')
+++      if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
+++         asm_error('badly formatted mode on command line')
+++      dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+++         }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4])) # 4th argument: annots on/off
+++      preprocess = None
+++   elif mode == 'inline_c':
+++      dumper = inline_c_dumper_t(False)
+++      preprocess = preprocess_inline_c(dumper)
+++   elif mode == 'inline_c:annots':
+++      dumper = inline_c_dumper_t(True)
+++      preprocess = preprocess_inline_c(dumper)
+++   elif mode == 'asvc':
+++      dumper = asvc_dumper_t()
+++      preprocess = None
+++   elif mode == 'aliases':
+++      dumper = aliases_dumper_t()
+++      preprocess = None
+++   elif mode == 'aliases:inline_c':
+++      dumper = aliases_dumper_t()
+++      preprocess = preprocess_inline_c(dumper)
+++   else:
+++      asm_error('invalid mode')
+++   external_link = dumper.external_link()
+++
+++   # handle target
+++   target = options.target or 'b0' # assume b0 if no target specified
+++   if target == 'a0':
+++      have_sema = False
+++      have_am = False
+++      mulw_rotate = False
+++      have_lthrsw = False
+++   elif target == 'b0':
+++      have_sema = True
+++      have_am = True
+++      mulw_rotate = True
+++      have_lthrsw = True
+++   elif target == 'hera':
+++      have_sema = True
+++      have_am = False
+++      mulw_rotate = True
+++      have_lthrsw = True
+++   else:
+++      asm_error('invalid target')
+++   if have_am: # target-dependent additions to the signal/argument tables
+++      sigs['loadam'] = SIG_LOADAM
+++      arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+++   if have_lthrsw:
+++      sigs['lthrsw'] = SIG_LTHRSW
+++      del sigs['int']
+++      arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+++
+++   # handle misc options
+++   allow_xor_0 = options.allow_xor_0
+++   dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+++   warnings_are_errors = options.warnings_are_errors
+++   disable_warnings = options.disable_warnings
+++
+++   # make options visible to asm
+++   arg_defs['mode'] = mode
+++   arg_defs['target'] = target
+++
+++   # arg_defs all setup at this point
+++   sets = arg_defs.copy() # todo: see arg_eval
+++
+++   # handle command line sets
+++   re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+++   for options_set in options.sets:
+++      m = re_options_set.match(options_set)
+++      if not m:
+++         asm_error('badly formatted set on command line')
+++      sets[m.group('name')] = arg_eval(m.group('val'), sets) # later sets can refer to earlier ones
+++
+++   # assemble input file and dump
+++   asm_file(sets, filename, filename, preprocess) # the filename doubles as the location
+++   asm_end_prog()
+++   dump(dumper)
+++   for name in arg_defs: # todo: see arg_eval
+++      del sets[name]
+++   dumper.sets(sets) # only the names that are not in arg_defs are left
+++
+++if __name__ == '__main__': # run main() only when executed as a script
+++   main()
++diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
++new file mode 100755
++index 0000000..6a9a33f
++--- /dev/null
+++++ b/pi-util/rebase_liblinks.py
++@@ -0,0 +1,37 @@
+++#!/usr/bin/env python
+++
+++import os, sys
+++from stat import *
+++
+++def walktree(top, callback, n, prefix):
+++    '''recursively descend the directory tree rooted at top, calling
+++       callback(path, link target, n, prefix) for each symbolic link found'''
+++    # n grows by one per directory level; lstat() is used so links themselves are examined
+++    for f in os.listdir(top):
+++        pathname = os.path.join(top, f)
+++        mode = os.lstat(pathname).st_mode
+++        if S_ISDIR(mode):
+++            # It's a directory, recurse into it
+++            walktree(pathname, callback, n+1, prefix)
+++        elif S_ISLNK(mode):
+++            # It's a symlink, pass its path and target to the callback
+++            callback(pathname, os.readlink(pathname), n, prefix)
+++
+++def visitfile(file, linkname, n, prefix): # walktree callback: make links into <prefix>lib/ relative
+++    if (linkname.startswith(prefix + 'lib/')):
+++        newlink = "../" * n + linkname[len(prefix):] # n levels up, then the target with prefix stripped
+++        print 'relinking', file, "->", newlink
+++        os.remove(file) # replace the link in place
+++        os.symlink(newlink, file)
+++
+++if __name__ == '__main__': # usage: rebase_liblinks.py <local root> [<old sysroot>]
+++    argc = len(sys.argv)
+++    if argc == 2:
+++        walktree(sys.argv[1], visitfile, 0, "/") # old sysroot prefix defaults to "/"
+++    elif argc == 3:
+++        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
+++    else:
+++        print "rebase_liblinks.py <local root> [<old sysroot>]"
+++
+++
+++
++diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
++new file mode 100755
++index 0000000..d8bdd91
++--- /dev/null
+++++ b/pi-util/syncroot.sh
++@@ -0,0 +1,43 @@
+++#!/bin/sh
+++set -e
+++if [ "$1" = "" ]; then
+++  echo Usage: $0 \<src_dir\> [\<rootname\>]
+++  echo src_dir is a source for rsync so may contain m/c name.
+++  echo rootname will be set to \"raspian_jessie_pi1\" if missing
+++  echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1
+++  exit 1
+++fi
+++# rsync selected lib/include trees from <src_dir> into build/linux/<rootname>-sysroot under the cwd
+++SYSROOT_NAME=$2
+++if [ "$SYSROOT_NAME" = "" ]; then
+++  SYSROOT_NAME=raspian_jessie_pi1
+++fi
+++
+++DST_ROOT=`pwd`
+++DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot
+++SRC=$1
+++
+++echo Sync src:  $SRC
+++echo Sync dest: $DST
+++
+++mkdir -p "$DST/lib"
+++mkdir -p "$DST/opt/vc/include"
+++mkdir -p "$DST/usr/lib/pkgconfig"
+++mkdir -p "$DST/usr/bin"
+++mkdir -p "$DST/usr/share"
+++
+++#### MUST NOT include /opt/vc/include/*GL*
+++# Creates conflicts with GL includes inside Chrome
+++
+++rsync -rl "$SRC/lib/arm-linux-gnueabihf" "$DST/lib"
+++rsync -rl "$SRC/opt/vc/lib" "$DST/opt/vc"
+++rsync -l  "$SRC/opt/vc/include/bcm_host.h" "$DST/opt/vc/include"
+++rsync -rl "$SRC/opt/vc/include/interface" "$DST/opt/vc/include"
+++rsync -rl "$SRC/opt/vc/include/vcinclude" "$DST/opt/vc/include"
+++rsync -rl "$SRC/usr/lib/arm-linux-gnueabihf" "$DST/usr/lib"
+++rsync -rl "$SRC/usr/lib/gcc" "$DST/usr/lib"
+++rsync -rl "$SRC/usr/include" "$DST/usr"
+++# rewrite absolute lib symlinks in the copied tree as relative ones
+++pi-util/rebase_liblinks.py "$DST"
+++
+++
diff --git a/tools/RPi/rpi-kodi-rebase.sh b/tools/RPi/rpi-kodi-rebase.sh
index fbf4838148..ac05939913 100755
--- a/tools/RPi/rpi-kodi-rebase.sh
+++ b/tools/RPi/rpi-kodi-rebase.sh
@@ -5,6 +5,8 @@ TODO=$1
 # Drop commits not used
 DROP_COMMITS="
 UNSTABLE\: This is a placeholder\. Commits after this point are considered experimental\.
+MANUALLY REMOVE - \[cec\] Add settings for configuring button repeats
+MANUALLY REMOVE - \[cec\] Don't discard buttons when repeat mode is enabled
 ADSP: Hack - disable
 "